php采集后的处理

<?php
/**
 * @name 采集后的处理.php
 * @date Sat Dec 22 02:07:45 CST 2007
 * @copyright 马永占(MyZ)
 * @author 马永占(MyZ)
 * @link
 */
// Post-processes previously scraped pages and merges them into one text file.
// (Author's note: this was used to copy five books; the source site is
// deliberately not disclosed.)
//
// NOTE(review): this listing was recovered from a published copy in which
// '<', '>', backslashes and most quotes had been stripped. The regex literals
// below were rebuilt from the surviving tag names; confirm them (in particular
// the attribute quoting of the artibody <div>) against the real page markup.
header("Content-Type:text/html;charset=utf8");

/**
 * Append $content to the file at $url, creating the file if needed.
 *
 * @param string $content Data to append.
 * @param string $url     Path of the target file.
 * @throws RuntimeException If the file cannot be opened or written.
 */
function writer($content,$url)
{
    $fp = fopen($url, "ab");
    if ($fp === false) {
        throw new RuntimeException('Unable to open file for appending: '.$url);
    }
    $written = fwrite($fp, $content);
    fclose($fp);
    if ($written === false) {
        throw new RuntimeException('Unable to write to file: '.$url);
    }
}

// Three identical rule lines are printed before and after each heading.
$separator = str_repeat("————————————————\r\n", 3);

// Merge the numbered pages into ./myz/all.txt in a single pass.
// NOTE(review): the original comment says "pages 1 to 136" but the recovered
// condition is `$i < 136`, which stops at 135 — confirm the intended bound.
for ($i = 1; $i < 136; $i++) {
    $str = file_get_contents("./myz/".$i.".shtml");
    if ($str === false) {
        // Page file missing or unreadable: skip it instead of writing garbage.
        continue;
    }

    // Group 2 = heading text inside <h1>, group 6 = article body inside the div.
    $matched = preg_match(
        '/(<h1>)(.*?)(<\/h1>)(.*?)(<div class=artibody id=artibody>)(.*?)(<\/div>)/s',
        $str,
        $arr
    );
    if ($matched !== 1) {
        // Page does not have the expected layout; nothing to extract.
        continue;
    }

    // Turn paragraph tags into line breaks, then unwrap <span><a>text</a></span>
    // constructs so that only the link text remains.
    $body = preg_replace('/<p>|<\/p>/', "\r\n", $arr[6]);
    $body = preg_replace('/(<span[^>]+>.*?<a[^>]+>)(.*?)(<\/a><\/span>)/s', '$2', $body);

    $result = "\r\n".$separator.$i."—————-马永占的目录编号:".$arr[2]."\r\n".$separator.$body;
    writer($result, './myz/all.txt');
}
?>

<?php
/**
 * @name 采集书.php
 * @date Sun Mar 01 22:48:02 CST 2009
 * @copyright 马永占(MyZ)
 * @author 马永占(MyZ)
 * @link
 */
// Alternative response charset, left disabled by the author:
//header('Content-Type:text/html;charset=utf8');
header('Content-Type:text/html;charset=gb2312');
// Report every notice and warning while the scrape runs.
error_reporting(E_ALL);
date_default_timezone_set('Asia/Shanghai');
// 0 removes the execution time limit.
set_time_limit(0);
/**
 * Append $content to the file at $url, creating the file if needed.
 *
 * The file is opened in binary append mode, so repeated calls accumulate
 * data rather than overwrite it.
 *
 * @param string $content Data to append.
 * @param string $url     Path of the target file.
 * @throws RuntimeException If the file cannot be opened or written.
 */
function writer($content,$url)
{
    $fp = fopen($url, 'ab');
    if ($fp === false) {
        throw new RuntimeException('Unable to open file for appending: '.$url);
    }
    $written = fwrite($fp, $content);
    // Close before reporting a write failure so the handle never leaks.
    fclose($fp);
    if ($written === false) {
        throw new RuntimeException('Unable to write to file: '.$url);
    }
}
$folder = '2'; // output folder
$book_base_url = 'xxxxxxxxxxxxxxxxxxxxx';
$book_url = 'yyyyyyyyyyyyy.html';

// NOTE(review): the HTML tag literals in the three patterns below were stripped
// when this listing was published (only the group structure survived). They
// are filled in from the author's sibling post-processing script, which has the
// same group layout — confirm them against the target site's markup.
$page_pattern = '/(<h1>)(.*?)(<\/h1>)(.*?)(<div class=artibody id=artibody>)(.*?)(<\/div>)/s';
$link_pattern = '/(<span[^>]+>.*?<a[^>]+>)(.*?)(<\/a><\/span>)/s';
$para_pattern = '/<p>|<\/p>/';

// Three identical rule lines are printed before and after each chapter title.
$separator = str_repeat("————————————————\r\n", 3);

// Download the book's index page and collect the distinct chapter file names.
$main = file_get_contents($book_base_url.$book_url);
if ($main === false) {
    exit('Unable to download the book index: '.$book_base_url.$book_url);
}
preg_match_all('/chapter_.*?\.html/', $main, $pages);
$pages = array_unique($pages[0]);

foreach ($pages as $value) {
    $chapter_file = './'.$folder.'/'.$value.'.txt';

    // Keep a raw copy of the chapter page on disk, then work from that copy.
    $html = file_get_contents($book_base_url.$value);
    if ($html === false) {
        // Download failed: skip this chapter rather than store an empty page.
        continue;
    }
    writer($html, $chapter_file);
    $str = file_get_contents($chapter_file);
    if ($str === false) {
        continue;
    }

    // Group 2 = chapter title, group 6 = chapter body.
    if (preg_match($page_pattern, $str, $arr) !== 1) {
        // Page does not have the expected layout; nothing to extract.
        continue;
    }

    // Turn paragraph tags into line breaks, then unwrap linked spans so that
    // only the link text remains.
    $body = preg_replace($para_pattern, "\r\n", $arr[6]);
    $body = preg_replace($link_pattern, '$2', $body);

    $result = "\r\n".$separator."—————-".$arr[2]."\r\n".$separator.$body;
    writer($result, './'.$folder.'/new.txt');
}
?>

You can leave a response, or trackback from your own site.

Leave a Reply

网站地图xml地图