27 十

PHP cURL和正则表达式的Email地址提取脚本(1)

27 10月 2013 |
作者 Chin-Hock Tan
字号
打印
电子邮件地址

PHP cURL和正则表达式的Email地址提取脚本

在本文里，我将解释如何使用PHP/cURL从网页提取Email地址。PHP脚本将运用正则表达式匹配HTML标签提取。

想想看，如果我们寄出电邮开头以“先生您好”或“老板您好”，那收信者多数会把我们的邮件当成垃圾处理。所以使用网络爬虫或蜘蛛采集Email地址时，我们也需提取相关的资料如姓名，电话号码，公司名称，职位等。把这些资料包括在邮件内容，收信者就会仔细阅读。

当然，请您不要滥用海量采集电子邮件地址的能力，胡乱发出垃圾邮件，令人反感的广告内容，违反版权法或干扰网络带宽。如果您惹祸上身，小弟没有势力救不了您，还是请个律师帮忙好。

首先，我们看看一个简单的电邮地址提取脚本。之前的HttpCurl类将会被使用。

<?php

// Regular expression used to find e-mail addresses in fetched page text.
// Capture groups: 1 = leading whitespace, 2 = the address itself,
// 3 = a dotted piece of the local part, 4 and 5 = optional spaces on either
// side of "@", 6 = a domain label with its dot, 7 = top-level domain,
// 8 = trailing whitespace. Because of groups 4 and 5, "name @ host.com"
// is matched as well as "name@host.com".
define(
	'EMAIL_PATTERN',
	'/([\s]*)'                              // 1: leading whitespace
	. '('                                   // 2: start of the address
	. '[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*'   // local part (3: dotted pieces)
	. '([ ]+|)@([ ]+|)'                     // 4, 5: optional spaces around "@"
	. '([a-zA-Z0-9-]+\.)+'                  // 6: domain labels
	. '([a-zA-Z]{2,})'                      // 7: top-level domain
	. ')'                                   // end of group 2
	. '([\s]*)'                             // 8: trailing whitespace
	. '/i'                                  // case-insensitive match
);

/**
 * Contract for objects that process a fetched HTTP response.
 *
 * HttpCurl::runParser() calls parse() on any parser that implements this
 * interface.
 */
interface HttpScraper
{
	/**
	 * Handle one response.
	 *
	 * @param mixed $body Response body as passed by HttpCurl (the curl_exec() result).
	 * @param mixed $head HTTP status code as passed by HttpCurl (getStatus()).
	 */
	public function parse($body, $head);
}

/**
 * HttpScraper implementation that prints every e-mail address found in a page.
 */
class Scraper implements HttpScraper
{
	/**
	 * Echo each EMAIL_PATTERN match found in $body, one <pre> element per match.
	 *
	 * Nothing is printed unless the HTTP status equals 200 and at least one
	 * match is found.
	 *
	 * @param string $body Response body to search.
	 * @param int    $head HTTP status code of the response.
	 * @return void
	 */
	public function parse($body, $head)
	{
		// Only successful responses are scanned.
		if ($head != 200) {
			return;
		}

		// preg_match_all() yields 0 (no match) or false (error); both mean nothing to print.
		if (!preg_match_all(EMAIL_PATTERN, $body, $matches)) {
			return;
		}

		// $matches[0] holds the full matches, including any whitespace the
		// pattern captured around each address.
		foreach ($matches[0] as $email) {
			echo "<pre>";
			print_r($email);
			// Fixed: this used to emit a second opening <pre>, so no element was ever closed.
			echo "</pre>";
		}
	}
}

/**
 * Small cURL wrapper that fetches a URL and hands the response to an optional parser.
 *
 * The parser may be an HttpScraper instance or any PHP callable. After every
 * request it is invoked with the response body and the HTTP status code.
 */
class HttpCurl {
	// $_parser holds the value accepted by setParser(); $_cookie and $_timeout
	// are declared but not used inside this class.
	protected $_cookie, $_parser, $_timeout;
	// Results of the most recent request; $_ch is declared but not used inside this class.
	private $_ch, $_info, $_body, $_error;

	/**
	 * @param HttpScraper|callable|null $p Parser to run after each request, or null for none.
	 * @throws Exception When the cURL extension is not loaded.
	 */
	public function __construct($p = null) {
		if (!function_exists('curl_init')) {
			throw new Exception('cURL not enabled!');
		}
		$this->setParser($p);
	}

	/**
	 * Fetch $url and run the parser on the response.
	 *
	 * NOTE(review): request() has no return statement, so this always returns
	 * null; read the result through getBody() / getStatus().
	 *
	 * @param string $url Address to fetch.
	 */
	public function get($url) {
		return $this->request($url);
	}

	/**
	 * Perform the transfer, store body/info/error on the instance, then run the parser.
	 *
	 * @param string $url Address to fetch.
	 */
	protected function request($url) {
		$ch = curl_init($url);
		// Follow redirects, but give up after five hops.
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
		curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
		// Make curl_exec() return the body instead of printing it.
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
		// Same URL that was already given to curl_init().
		curl_setopt($ch, CURLOPT_URL, $url);
		$this->_body = curl_exec($ch);
		// Info and error must be read before the handle is closed.
		$this->_info  = curl_getinfo($ch);
		$this->_error = curl_error($ch);
		curl_close($ch);

		$this->runParser($this->_body, $this->getStatus());
	}

	/**
	 * HTTP status code of the most recent request.
	 *
	 * @return int The stored 'http_code' value, or 0 when no transfer info is stored yet.
	 */
	public function getStatus() {
		// Fixed: the key was the bare word http_code, which PHP 8 rejects as an
		// undefined constant (older versions only guessed the string and warned).
		// isset() also covers a call made before any request has run.
		return isset($this->_info['http_code']) ? $this->_info['http_code'] : 0;
	}

	/**
	 * Transfer information of the most recent request.
	 *
	 * @return mixed The stored curl_getinfo() result (not the raw response headers).
	 */
	public function getHeader() {
		return $this->_info;
	}

	/**
	 * Body of the most recent request.
	 *
	 * @return mixed The stored curl_exec() result.
	 */
	public function getBody() {
		return $this->_body;
	}

	// Intentionally empty: the cURL handle is already closed inside request().
	public function __destruct() {
	}

	/**
	 * Attach the parser that will receive each response.
	 *
	 * Accepted values are null, an HttpScraper instance, or a callable. Any
	 * other value is ignored and the current parser is kept.
	 *
	 * @param HttpScraper|callable|null $p
	 */
	public function setParser($p) {
		if ($p === null || $p instanceof HttpScraper || is_callable($p)) {
			$this->_parser = $p;
		}
	}

	/**
	 * Pass a response to the attached parser, if there is one.
	 *
	 * @param mixed $content Response body.
	 * @param mixed $header  HTTP status code (request() passes getStatus()).
	 */
	public function runParser($content, $header) {
		if ($this->_parser === null) {
			return;
		}

		if ($this->_parser instanceof HttpScraper) {
			$this->_parser->parse($content, $header);
		} else {
			call_user_func($this->_parser, $content, $header);
		}
	}
}

?>

如何操作：

1. 首先定义一个用来匹配电子邮件地址的正则表达式模式。这个模式只在本例中使用，我们将在下一篇文章中修改它。

// Regular expression used to find e-mail addresses in fetched page text.
// Capture groups: 1 = leading whitespace, 2 = the address itself,
// 3 = a dotted piece of the local part, 4 and 5 = optional spaces on either
// side of "@", 6 = a domain label with its dot, 7 = top-level domain,
// 8 = trailing whitespace. Because of groups 4 and 5, "name @ host.com"
// is matched as well as "name@host.com".
define(
	'EMAIL_PATTERN',
	'/([\s]*)'                              // 1: leading whitespace
	. '('                                   // 2: start of the address
	. '[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*'   // local part (3: dotted pieces)
	. '([ ]+|)@([ ]+|)'                     // 4, 5: optional spaces around "@"
	. '([a-zA-Z0-9-]+\.)+'                  // 6: domain labels
	. '([a-zA-Z]{2,})'                      // 7: top-level domain
	. ')'                                   // end of group 2
	. '([\s]*)'                             // 8: trailing whitespace
	. '/i'                                  // case-insensitive match
);

2. 然后定义称为HttpScraper的接口，内含一个公共方法parse().

/**
 * Contract for objects that process a fetched HTTP response.
 *
 * HttpCurl::runParser() calls parse() on any parser that implements this
 * interface.
 */
interface HttpScraper
{
	/**
	 * Handle one response.
	 *
	 * @param mixed $body Response body as passed by HttpCurl (the curl_exec() result).
	 * @param mixed $head HTTP status code as passed by HttpCurl (getStatus()).
	 */
	public function parse($body, $head);
}

3. 下一步，我们创建一个类Scraper实现上述接口。有两个参数将传递给parse()函数：一个是网页内容，另一个是该网页响应的HTTP状态码（HTTP_CODE）。如果$head的值是200，我们使用函数preg_match_all在网页内容中匹配电子邮件地址的模式(EMAIL_PATTERN)。在本教程中，我们在屏幕上打印结果。

/**
 * HttpScraper implementation that prints every e-mail address found in a page.
 */
class Scraper implements HttpScraper
{
	/**
	 * Echo each EMAIL_PATTERN match found in $body, one <pre> element per match.
	 *
	 * Nothing is printed unless the HTTP status equals 200 and at least one
	 * match is found.
	 *
	 * @param string $body Response body to search.
	 * @param int    $head HTTP status code of the response.
	 * @return void
	 */
	public function parse($body, $head)
	{
		// Only successful responses are scanned.
		if ($head != 200) {
			return;
		}

		// preg_match_all() yields 0 (no match) or false (error); both mean nothing to print.
		if (!preg_match_all(EMAIL_PATTERN, $body, $matches)) {
			return;
		}

		// $matches[0] holds the full matches, including any whitespace the
		// pattern captured around each address.
		foreach ($matches[0] as $email) {
			echo "<pre>";
			print_r($email);
			// Fixed: this used to emit a second opening <pre>, so no element was ever closed.
			echo "</pre>";
		}
	}
}

4. 这里可看到HttpCurl被修改的部份。

/**
 * Small cURL wrapper that fetches a URL and hands the response to an optional parser.
 *
 * The parser may be an HttpScraper instance or any PHP callable. After every
 * request it is invoked with the response body and the HTTP status code.
 */
class HttpCurl {
	// $_parser holds the value accepted by setParser(); $_cookie and $_timeout
	// are declared but not used inside this class.
	protected $_cookie, $_parser, $_timeout;
	// Results of the most recent request; $_ch is declared but not used inside this class.
	private $_ch, $_info, $_body, $_error;

	/**
	 * @param HttpScraper|callable|null $p Parser to run after each request, or null for none.
	 * @throws Exception When the cURL extension is not loaded.
	 */
	public function __construct($p = null) {
		if (!function_exists('curl_init')) {
			throw new Exception('cURL not enabled!');
		}
		$this->setParser($p);
	}

	/**
	 * Fetch $url and run the parser on the response.
	 *
	 * NOTE(review): request() has no return statement, so this always returns
	 * null; read the result through getBody() / getStatus().
	 *
	 * @param string $url Address to fetch.
	 */
	public function get($url) {
		return $this->request($url);
	}

	/**
	 * Perform the transfer, store body/info/error on the instance, then run the parser.
	 *
	 * @param string $url Address to fetch.
	 */
	protected function request($url) {
		$ch = curl_init($url);
		// Follow redirects, but give up after five hops.
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
		curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
		// Make curl_exec() return the body instead of printing it.
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
		// Same URL that was already given to curl_init().
		curl_setopt($ch, CURLOPT_URL, $url);
		$this->_body = curl_exec($ch);
		// Info and error must be read before the handle is closed.
		$this->_info  = curl_getinfo($ch);
		$this->_error = curl_error($ch);
		curl_close($ch);

		$this->runParser($this->_body, $this->getStatus());
	}

	/**
	 * HTTP status code of the most recent request.
	 *
	 * @return int The stored 'http_code' value, or 0 when no transfer info is stored yet.
	 */
	public function getStatus() {
		// Fixed: the key was the bare word http_code, which PHP 8 rejects as an
		// undefined constant (older versions only guessed the string and warned).
		// isset() also covers a call made before any request has run.
		return isset($this->_info['http_code']) ? $this->_info['http_code'] : 0;
	}

	/**
	 * Transfer information of the most recent request.
	 *
	 * @return mixed The stored curl_getinfo() result (not the raw response headers).
	 */
	public function getHeader() {
		return $this->_info;
	}

	/**
	 * Body of the most recent request.
	 *
	 * @return mixed The stored curl_exec() result.
	 */
	public function getBody() {
		return $this->_body;
	}

	// Intentionally empty: the cURL handle is already closed inside request().
	public function __destruct() {
	}

	/**
	 * Attach the parser that will receive each response.
	 *
	 * Accepted values are null, an HttpScraper instance, or a callable. Any
	 * other value is ignored and the current parser is kept.
	 *
	 * @param HttpScraper|callable|null $p
	 */
	public function setParser($p) {
		if ($p === null || $p instanceof HttpScraper || is_callable($p)) {
			$this->_parser = $p;
		}
	}

	/**
	 * Pass a response to the attached parser, if there is one.
	 *
	 * @param mixed $content Response body.
	 * @param mixed $header  HTTP status code (request() passes getStatus()).
	 */
	public function runParser($content, $header) {
		if ($this->_parser === null) {
			return;
		}

		if ($this->_parser instanceof HttpScraper) {
			$this->_parser->parse($content, $header);
		} else {
			call_user_func($this->_parser, $content, $header);
		}
	}
}

5. 我们在HttpCurl的构造函数里调用了新增的成员函数setParser()。这个函数把传递给$p的对象或回调函数存储在$_parser里。

	/**
	 * Attach the parser that will receive each response.
	 *
	 * Accepted values are null, an HttpScraper instance, or a callable. Any
	 * other value is ignored and the current parser is kept.
	 *
	 * @param HttpScraper|callable|null $p
	 */
	public function setParser($p)	{
		$usable = ($p === null) || ($p instanceof HttpScraper) || is_callable($p);
		if (!$usable) {
			return;
		}

		$this->_parser = $p;
	}

6. 我们在类HttpCurl加了函數runParser()。runParser()执行$_parser里的对象或回调函数。我在这文章里使用对象。

	/**
	 * Pass a response to the attached parser, if there is one.
	 *
	 * An HttpScraper parser receives the call through parse(); any other
	 * stored value is invoked as a callable with the same two arguments.
	 *
	 * @param mixed $content Response body.
	 * @param mixed $header  HTTP status code.
	 */
	public function runParser($content, $header)	{
		$parser = $this->_parser;
		if ($parser === null) {
			return;
		}

		if ($parser instanceof HttpScraper) {
			$parser->parse($content, $header);
			return;
		}

		call_user_func($parser, $content, $header);
	}

7. runParser()函數在 request()的cURL请求网页源文件后才执行。

	/**
	 * Perform the transfer for $url, store body/info/error on the instance,
	 * then hand the body and status code to runParser().
	 *
	 * There is no return statement, so callers receive null.
	 *
	 * @param string $url Address to fetch.
	 */
	protected function request($url) {
        $ch = curl_init($url);
		// Follow redirects, but give up after five hops.
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
		curl_setopt($ch, CURLOPT_MAXREDIRS, 5);		
		// Make curl_exec() return the body instead of printing it.
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
		// Same URL that was already given to curl_init().
		curl_setopt($ch, CURLOPT_URL, $url);
		$this->_body = curl_exec($ch);
		// Info and error are read before the handle is closed.
        $this->_info  = curl_getinfo($ch);
        $this->_error = curl_error($ch);
        curl_close($ch);

		// The parser runs only after the transfer has finished.
		$this->runParser($this->_body, $this->getStatus());				
    }

神兵降世，饮血开锋，现在我们可试试新脚本了。

如果您开发了产业网站，并已放在网上，您需要产业代理注册成会员上传他/她们的产业。但要让他/她们通过搜索引擎找到您的网站是非常困难的事。我们可通过别的网站找到他/她们的联络方法，采集后可发邮件邀请他/她们进入您的网站。以下是一个简单的例子。

提取产业代理的邮址

在网页源文件里，就可看到HTML代码。

网页源文件

现在修改我们的test.php

<?php
// Demo driver: fetch one page and echo every e-mail address found in it.
// Fixed: the opening tag was written "<? php" (with a space), which is not
// PHP's standard opening tag, so the script could not run as written.

// Presumably httpcurl.php holds the EMAIL_PATTERN, HttpScraper, Scraper and
// HttpCurl definitions from the listing above - confirm before running.
include 'httpcurl.php';

// Placeholder: replace with the URL of the page to scrape.
$target = "http://目标域名";

// HttpCurl passes the body and status of every response to Scraper::parse().
$up = new Scraper;
$test = new HttpCurl($up);

// The parser runs inside get(), so the matches are echoed during this call.
$test->get($target);

?>

执行后可看到脚本提取到电邮地址了。

执行后可看到脚本提取到电邮地址

先别开心，因为我们还没有提取到别的资料，如名字和电话号码。下一章我们只需稍微修改脚本就可达到目的。

最后修改于星期四, 03 11月 2016 06:33

给本项目评分

(0 得票数)

阅读 5897 次数

发布于网络内容采集

来源

PHP cURL和正则表达式的Email地址提取脚本(1)

相关项目

评语

最多点击

今天就使用 IPVanish VPN!