|
下面是用 PHP 获取指定网页内容的实现代码,属于采集程序中经常用到的一类代码,这里只给出一个简单的基础示例。更详细的资料可以参考现成的 PHP 采集程序及其可以直接使用的源码,这样才能学到更多的东西。
把别人的想法消化成自己的想法,慢慢地你就会发现,自己已经掌握了临场解决很多问题的思路与方法。
- <?php
- /*
- 功能:获取页面内容,存储下来阅读; lost63
- */
/**
 * Minimal page scraper.
 *
 * Constructing the object does all the work: it downloads the index page at
 * the given URL, collects the article links found on it, downloads every
 * article and writes the extracted article section to "<page title>.html" in
 * the current working directory, echoing one status line per page.
 *
 * The code deliberately avoids PHP 7+/8-only syntax so it keeps running on the
 * old interpreters the original targeted, while no longer relying on features
 * that were removed in PHP 8 (PHP 4-style constructors, each()).
 */
class GetUrl
{
    /** Text that marks the beginning of the section to keep in an article page. */
    const ARTICLE_START = 'http://pagead2.googlesyndication.com/pagead/show_ads.js';

    /** Text that marks the end of the section to keep in an article page. */
    const ARTICLE_END = '<div id="article_detail">';

    /** @var string URL being fetched; starts as the index URL, then is reassigned per article. */
    public $url;

    /** @var resource|null Stream handle while a download is in progress, null otherwise. */
    public $result;

    /** @var string Raw body of the most recently downloaded page. */
    public $content = '';

    /** @var array preg_match_all() groups for the index page; index 2 holds the cleaned article links. */
    public $list = array();

    /**
     * Downloads the index page and saves every article it links to.
     *
     * @param string $url Address of the index page.
     */
    public function __construct($url)
    {
        $this->url = $url;

        if (!$this->GetContent()) {
            echo $this->escape($url) . " 抓取失败<br>\n";
            return;
        }

        $this->GetList();
        $this->FileSave();
    }

    /**
     * Downloads $this->url into $this->content.
     *
     * @return bool True on success; false when the URL cannot be opened or read
     *              (in which case $this->content is left empty).
     */
    private function GetContent()
    {
        $this->content = '';

        // fopen() returns false on failure; looping on feof(false) never ends on
        // PHP 7 and throws a TypeError on PHP 8, so the handle must be checked.
        $this->result = fopen($this->url, 'r');
        if ($this->result === false) {
            $this->result = null;
            return false;
        }

        $data = stream_get_contents($this->result);

        // Always release the stream; the scraper opens one per article.
        fclose($this->result);
        $this->result = null;

        if ($data === false) {
            return false;
        }

        $this->content = $data;
        return true;
    }

    /**
     * Extracts the article links from the index page into $this->list[2].
     *
     * A link is kept only when its href contains both ".html" (not at offset 0,
     * which would leave an empty page name) and "jiaocheng". Everything after
     * ".html" is cut off, because the lazy href pattern also swallows any
     * attributes that follow the URL. Links are resolved against the index URL
     * and deduplicated after that clean-up, so two anchors that differ only in
     * trailing attributes yield a single download.
     */
    private function GetList()
    {
        $matches = array();
        preg_match_all('/<a(.*?)href="(.*?)">(.*?)<\/a>/', $this->content, $matches);
        $hrefs = isset($matches[2]) ? $matches[2] : array();

        $links = array();
        foreach ($hrefs as $key => $href) {
            // strpos() yields false for "not found" and 0 for "found at the
            // start"; the two must be told apart with strict comparisons.
            $htmlPos = strpos($href, '.html');
            if ($htmlPos === false || $htmlPos === 0 || strpos($href, 'jiaocheng') === false) {
                continue;
            }

            $link = $this->resolveLink(substr($href, 0, $htmlPos) . '.html');
            if ($link !== null) {
                $links[$key] = $link;
            }
        }

        $matches[2] = array_unique($links);
        $this->list = $matches;
    }

    /**
     * Turns an href taken from the index page into a URL that is safe to fetch.
     *
     * The href comes from remote, untrusted HTML and is later handed to fopen(),
     * so any explicit scheme other than http/https (file:, php:, data:, ...) is
     * rejected. Relative hrefs are resolved against the index URL; "." and ".."
     * segments are not normalised. When the index itself was not loaded over
     * HTTP(S), relative hrefs are returned unchanged.
     *
     * @param string $href Cleaned href ending in ".html".
     * @return string|null URL to fetch, or null when the link must be skipped.
     */
    private function resolveLink($href)
    {
        if (preg_match('#^https?://#i', $href)) {
            return $href;
        }
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
            return null;
        }

        $base = parse_url($this->url);
        $scheme = isset($base['scheme']) ? strtolower($base['scheme']) : '';
        if (($scheme !== 'http' && $scheme !== 'https') || empty($base['host'])) {
            return $href;
        }

        $origin = $scheme . '://' . $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');

        if (strpos($href, '//') === 0) {
            return $scheme . ':' . $href;
        }
        if (strpos($href, '/') === 0) {
            return $origin . $href;
        }

        $path = isset($base['path']) ? $base['path'] : '/';
        $directory = substr($path, 0, strrpos($path, '/') + 1);

        return $origin . $directory . $href;
    }

    /**
     * Downloads every collected article and stores its extracted section on disk.
     *
     * Pages that cannot be downloaded, that lack the expected section markers or
     * that cannot be written are reported and skipped instead of aborting the run.
     */
    private function FileSave()
    {
        foreach ($this->list[2] as $link) {
            $this->url = $link;

            if (!$this->GetContent()) {
                echo $this->escape($link) . " 抓取失败<br>\n";
                continue;
            }

            $article = $this->str_cut($this->content, self::ARTICLE_START, self::ARTICLE_END);
            if ($article === '') {
                echo $this->escape($link) . " 未找到正文<br>\n";
                continue;
            }

            $filename = $this->buildFilename();
            if (file_put_contents($filename, $article) === false) {
                echo $filename . " 保存失败<br>\n";
                continue;
            }

            echo $filename . "保存 OK<br>\n";
        }
    }

    /**
     * Derives a safe local file name from the <title> of the current page.
     *
     * The title is remote input, so path separators, characters that are
     * invalid in file names and control characters are replaced with "_".
     * The replacement is byte-wise because the page encoding is unknown; in
     * multi-byte legacy encodings this may alter a character but never
     * produces an unsafe name. Falls back to a hash of the URL when the page
     * has no usable title.
     *
     * @return string File name ending in ".html".
     */
    private function buildFilename()
    {
        $title = '';
        if (preg_match('/<title>(.*?)<\/title>/is', $this->content, $found)) {
            $title = $found[1];
        }

        $name = trim((string) preg_replace('/[\\\\\/:*?"<>|\x00-\x1F]+/', '_', $title), ' .');
        if ($name === '') {
            $name = md5($this->url);
        }

        return $name . '.html';
    }

    /**
     * Escapes text for the HTML status output.
     *
     * A single-byte charset is passed on purpose: it makes htmlspecialchars()
     * escape only the ASCII special characters and never reject the input,
     * whatever encoding the scraped page used.
     *
     * @param string $text Raw text.
     * @return string HTML-safe text.
     */
    private function escape($text)
    {
        return htmlspecialchars($text, ENT_QUOTES, 'ISO-8859-1');
    }

    /**
     * Returns the part of $str located between $start and the next $end.
     *
     * @param string $str   Text to search.
     * @param string $start Marker that precedes the wanted section (not included).
     * @param string $end   Marker that follows the wanted section (not included).
     * @return string The enclosed text, or '' when either marker is missing.
     */
    public function str_cut($str, $start, $end)
    {
        $from = strpos($str, $start);
        if ($from === false) {
            return '';
        }
        $from += strlen($start);

        // Search for the end marker only after the start marker, and treat a
        // missing marker explicitly instead of letting false become a length.
        $to = strpos($str, $end, $from);
        if ($to === false) {
            return '';
        }

        return substr($str, $from, $to - $from);
    }
}
// Entry point: constructing GetUrl downloads this forum index page and saves
// the articles it links to.
$startUrl = "http://www.myolnet.com/forum.php?mod=forumdisplay&fid=45";
$w = new GetUrl($startUrl);
- ?>
复制代码 |
|