【原创教程，高手勿喷】利用 PHP 抓取本吧首页的帖子标题

2012 年 12 月 8 日 · 5920

无语，随便回了个帖，就被认为是**，还被删帖，我只是实话实说而已。这个功能如果用 Python 来写采集，只需要几行代码。我个人也写过采集软件，以下是 PHP 部分的采集代码：它动态生成采集规则，实现过采集任何网站中的信息内容。如果再被删除，我就不回了，年轻人浮躁。
class ControllerlankiibeginCaiji extends Controller {

public function index() {
$this->load->model('lankii/caijiSetting');

$caijiRule=$this->model_lankii_caijiSetting->getCaijiRule($this->request->get['ruleId']);

if(is_array($caijiRule[0])){
$this->data['caijiRule']=$caijiRule[0];
$this->load->model('lankii/caijiHttpClass');

$myhttp=$this->model_lankii_caijiHttpClass->doGet($caijiRule[0]['targetUrl']);
$lankiiHTML=mb_convert_encoding($myhttp, "UTF-8", $caijiRule[0]['targetSiteCharset']);

$urlGetAll="/<a(.*)href=\"?([^\"]*)\"?/i";
preg_match_all($urlGetAll,$lankiiHTML,$allUrl);

$lankiiHTML="";

//去除重复网址
if($caijiRule[0]['urlReplaceSame']){$allUrl[2]=array_flip(array_flip($allUrl[2])); }
$this->data['allUrl']=str_ireplace('../','',$allUrl[2]);

$urlCateReg=$this->replace($caijiRule[0]['urlCategoryReg']);
$urlCateReg='/'.$urlCateReg.'/i';

0 0