脚本宝典收集整理的这篇文章主要介绍了PHP制作百度词典查词采集器,脚本宝典觉得挺不错的,现在分享给大家,也给大家做个参考。
百度dict 采集样本
写的采集百度dict词典翻译后的所有结果数据,当然附带了13.5w单词库和采集简单的案例,这里我把写出的主要类dict.class.php放出来,项目地址http://github.com/widuu/baidu_dict,有需要的直接fork就可以了~么么哒,这东西用的人很少,所以有用的兄弟拿走了哈~
/**
 * Look up a word and collect every section scraped from its result page.
 *
 * NOTE(review): the text that preceded this method (presumably the opening
 * PHP tag, the class declaration and its properties) is missing from this
 * file; only a fragment of the original doc comment survived.
 *
 * @param string $word the word to look up; stored in $this->word for the getters
 * @return array array(
 *     "symbol"  => phonetic symbols,
 *     "pro"     => pronunciation,
 *     "example" => example sentences,
 *     "explain" => concise definitions,
 *     "synonym" => synonyms and antonyms,
 *     "phrase"  => phrase array
 * )
 */
public function content($word){
    $this -> word = $word;
    $symbol  = $this -> Pronounced();
    $pro     = $this -> getSay();
    $example = $this -> getExample();
    $explain = $this -> getExplain();
    $synonym = $this -> getSynonym();
    $phrase  = $this -> getPhrase();
    $result = array(
        "symbol"  => $symbol,  // phonetic symbols
        "pro"     => $pro,     // pronunciation
        "example" => $example, // example sentences
        "explain" => $explain, // concise definitions
        "synonym" => $synonym, // synonyms and antonyms
        "phrase"  => $phrase   // phrase array
    );
    return $result;
}
/**
 * Download the dictionary result page for the current word ($this->word).
 *
 * Issues an HTTP GET to dict.baidu.com with a desktop Firefox user agent,
 * follows redirects and gives up after 30 seconds.
 *
 * @return string|false the response body, or false when the transfer failed
 *                      (the cURL error message is echoed in that case)
 */
private function getContent(){
    $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
    // Encode the word so spaces and non-ASCII characters form a valid query string.
    $url = "http://dict.baidu.com/s?wd=".urlencode($this->word);
    $ch = curl_init();
    curl_setopt($ch,CURLOPT_URL,$url);
    curl_setopt($ch,CURLOPT_USERAGENT,$useragent);
    // Constant names are case-sensitive; return the body instead of printing it.
    curl_setopt($ch,CURLOPT_RETURNTRANSFER,TRUE);
    curl_setopt($ch,CURLOPT_FOLLOWLOCATION,1);
    curl_setopt($ch,CURLOPT_HTTPGET,1);
    // curl_setopt() takes exactly one option per call.
    curl_setopt($ch,CURLOPT_AUTOREFERER,1);
    curl_setopt($ch,CURLOPT_HEADER,0);
    curl_setopt($ch,CURLOPT_TIMEOUT,30);
    $result = curl_exec($ch);
    // Inspect the handle that was actually used for the transfer.
    if (curl_errno($ch)) {
        echo 'Errno'.curl_error($ch);
    }
    curl_close($ch);
    return $result;
}
/**
 * Extract the phonetic symbols from the result page.
 *
 * Captures the text between each `"EN-US">` marker and the following `</b>`.
 *
 * @return array array('en' => first capture, 'us' => second capture);
 *               an entry is null when the page did not contain it
 */
private function Pronounced(){
    $data = $this -> getContent();
    preg_match_all("/\"EN\-US\"\>(.*)\<\/b\>/Ui",$data,$pronounced);
    return array(
        'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
        'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
    );
}
/**
 * Get the pronunciation entries from the result page.
 *
 * Captures the value of each `url="..."` attribute found in the page.
 *
 * @return array array('en' => British, 'us' => American), taken from the first
 *               and second capture; an entry is null when the page did not contain it
 */
private function getSay(){
    $data = $this -> getContent();
    preg_match_all("/url=\"(.*)\"/Ui",$data,$pronounced);
    return array(
        'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
        'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
    );
}
/**
 * Extract the example sentences embedded in the page's `example_data`
 * script variable.
 *
 * The nested literal is split on "[[[" and "[[" boundaries; within each
 * piece every `["token",` capture is collected and the tokens are joined
 * with single spaces to rebuild one sentence.
 *
 * @return array sentences grouped two per chunk (presumably a sentence and
 *               its translation — confirm against live data); empty when the
 *               page carries no example data
 */
private function getExample(){
    $data = $this -> getContent();
    // The pattern is case-sensitive, so the JavaScript keyword must be lower-case.
    preg_match_all("/var example_data = (.*)\]\;/Us",$data,$example);
    if (!isset($example[1][0])) {
        return array();
    }
    $sentences = array();
    // Normalise the leading brackets so the first group splits like the others.
    $data1 = "[[[".ltrim($example[1][0],"[");
    foreach (explode("[[[",$data1) as $value) {
        foreach (explode("[[","[[".$value) as $v) {
            preg_match_all("/\[\"(.*)\",/Us","[".$v,$match);
            if (!empty($match[1])) {
                // Collected in an array so that sentences containing "@" stay intact.
                $sentences[] = implode(" ",$match[1]);
            }
        }
    }
    return array_chunk($sentences,2);
}
/**
 * Extract the concise definitions from the "en-simple-means" section.
 *
 * @return array array(
 *     "x" => map of each <h3> label to the text of the <span> that follows it,
 *     "b" => map of each "tag:" label to the tag-stripped text of its link
 * ); both maps are empty when the section is missing
 */
private function getExplain(){
    $data = $this -> getContent();
    preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\<div(\s+)class\=\"source\"\>/Us",$data,$explain);
    $r_data = isset($explain[1][0]) ? $explain[1][0] : "";
    preg_match_all("/\<p\>\<h3\>(?P<adj>.*)\<\/h3\>\<span\>(?P<name>.*)\<\/span\>\<\/p\>/Us",$r_data,$a_data);
    // NOTE(review): assumes the section text is the intended subject here as well — confirm.
    preg_match_all("/\<span\>(?P<tag>[^\>]+)\:\<a(\s+)href\=\"(.*)\"\>(?P<word>.*)\<\/a\>\<\/span\>/Us",$r_data,$b_data);
    $result = array();
    foreach ($a_data["adj"] as $key => $value) {
        $result[$value] = $a_data["name"][$key];
    }
    $word_b = array();
    foreach ($b_data["tag"] as $key => $value) {
        $word_b[$value] = strip_tags($b_data["word"][$key]);
    }
    return array("x" => $result,"b" => $word_b);
}
/**
 * Extract the synonym / antonym lists from the "en-syn-ant" section.
 *
 * The section is split on "</dl>"; within each piece every <h3> heading and
 * its "syn-ant-list" <ul> are captured, and each <li> contributes one
 * title => words entry.
 *
 * @return array $result[piece index][heading][title] = words as plain text
 *               separated by single spaces; empty when the section is missing
 */
private function getSynonym(){
    $data = $this -> getContent();
    preg_match_all("/id=\"en\-syn\-ant\"\>(.*)<div(\s+)class\=\"source\"\>/Us",$data,$synonym);
    if (!isset($synonym[1][0])) {
        return array();
    }
    $result = array();
    foreach (explode("</dl>",$synonym[1][0]) as $key => $value) {
        // NOTE(review): the entity before </h3> is assumed to be &nbsp; — confirm against the live markup.
        preg_match_all("/\<h3\>(?P<adj>.*)\&nbsp\;\<\/h3\>\<\/div\>\<div(\s+)class\=\"syn\-ant\-list\"\>\<ul\>(?<content>.*)\<\/ul\>/Us",$value,$r_data);
        foreach ($r_data["content"] as $k => $v) {
            if (empty($v)) {
                continue;
            }
            preg_match_all("/\<li\>\<p\>(?P<title>.*)\<\/p\>(?P<value>.*)\<\/li>/Us",$v,$v_data);
            foreach ($v_data['title'] as $m => $d) {
                // Turn every tag into a space so adjacent words stay separated,
                // then drop leftovers and collapse the whitespace.
                $words = preg_replace("/<[^>]*>/"," ",$v_data["value"][$m]);
                $words = trim(preg_replace("/\s+/"," ",strip_tags($words)));
                $result[$key][$r_data["adj"][$k]][$d] = $words;
            }
        }
    }
    return $result;
}
/**
 * Extract the phrase entries from the "en-phrase" section.
 *
 * The section is split on "</dd>" and sliced with the offset self::$num;
 * every remaining entry is split on "</p>". The first segment is the phrase
 * and the second its meaning. Entries with more segments also carry pairs of
 * extra segments (presumably an example and its translation — confirm),
 * which are nested next to the meaning.
 *
 * NOTE(review): the segment slicing in the long-entry branch was rewritten to
 * agree with the short-entry branch (phrase = segment 0, meaning = segment 1);
 * verify the resulting shape against live data.
 *
 * @return array phrase => meaning, or phrase => array(meaning, array(pair...))
 *               nested once per extra pair; empty when the section is missing
 */
private function getPhrase(){
    $num = self::$num;
    $data = $this -> getContent();
    preg_match_all("/id=\"en\-phrase\"\>(.*)\<div class\=\"source\"\>/Us",$data,$phrase);
    if (!isset($phrase[1][0])) {
        return array();
    }
    $entries = array_slice(explode("</dd>",$phrase[1][0]),$num);
    $result = array();
    foreach ($entries as $entry) {
        $parts = explode("</p>",$entry);
        $n = count($parts);
        if ($n < 2) {
            // Nothing but trailing markup: there is no phrase/meaning pair to read.
            continue;
        }
        if ($n <= 3) {
            $result[strip_tags($parts[0])] = strip_tags($parts[1]);
            continue;
        }
        // NOTE(review): a character sequence was originally removed from the
        // key here, but the search token is not recoverable from this file.
        $key_value = trim(strip_tags($parts[0]));
        $result[$key_value] = strip_tags($parts[1]);
        // Segments between the meaning and the trailing remainder, two per pair.
        foreach (array_chunk(array_slice($parts,2,$n-3),2) as $pair) {
            foreach ($pair as $k => $v) {
                $pair[$k] = strip_tags($v);
            }
            $result[$key_value] = array($result[$key_value],$pair);
        }
    }
    return $result;
}
/**
 * Convert an array into its escaped, exported string form.
 *
 * @param array $data       the array to convert
 * @param bool  $isformdata when 0, new_stripslashes() is not applied first;
 *                          optional, defaults to 1
 * @return string addslashes() applied to the var_export() text of the data,
 *                or an empty string when $data loosely equals ''
 */
private function array2string($data,$isformdata = 1) {
    if ($data == '') {
        return '';
    }
    $source = $isformdata ? $this->new_stripslashes($data) : $data;
    $exported = var_export($source,TRUE);
    return addslashes($exported);
}
/**
 * Apply stripslashes() to a string, or recursively to every element of an
 * array while keeping its keys.
 *
 * @param mixed $string the string or array to process
 * @return mixed the processed string, or an array of the same shape
 */
private function new_stripslashes($string) {
    if (is_array($string)) {
        $cleaned = array();
        foreach ($string as $index => $item) {
            $cleaned[$index] = $this->new_stripslashes($item);
        }
        return $cleaned;
    }
    return stripslashes($string);
}
}
// $word = new dict("exPress");
// $word ->content();
@H_419_19@
以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢。
以上是脚本宝典为你收集整理的PHP制作百度词典查词采集器全部内容,希望文章能够帮你解决PHP制作百度词典查词采集器所遇到的问题。
本图文内容来源于网友网络收集整理提供,作为学习参考使用,版权属于原作者。
如您有任何意见或建议可联系处理。小编QQ:384754419,请注明来意。