From Mike Nott: For crawling in PHP I have always used the fantastic cURL.
My curl single-threaded function: function singlethread_crawl($url) { $agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"; $ch = curl_init(); curl_setopt($ch, CURLOPT_NOSIGNAL, 1); curl_setopt($ch, CURLOPT_NOPROGRESS, 1); curl_setopt($ch, CURLOPT_FAILONERROR, 1); [...]
For crawling in PHP I have always used the fantastic cURL.
My curl single-threaded function:
/**
 * Download one URL with cURL and hand back whatever curl_exec() produced.
 *
 * The transfer identifies itself as MSIE 6, follows at most one redirect,
 * is capped at 5 seconds, skips SSL peer verification and has
 * CURLOPT_FAILONERROR switched on.
 *
 * @param string $url Address to fetch.
 * @return string|bool The value returned by curl_exec(): the body, or false on failure.
 */
function singlethread_crawl($url)
{
    // Option table, applied below in the same order the options were originally set.
    $options = array(
        array(CURLOPT_NOSIGNAL, 1),
        array(CURLOPT_NOPROGRESS, 1),
        array(CURLOPT_FAILONERROR, 1),
        array(CURLOPT_URL, $url),
        array(CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"),
        array(CURLOPT_SSL_VERIFYPEER, 0),
        array(CURLOPT_FOLLOWLOCATION, 1),
        array(CURLOPT_MAXREDIRS, 1),
        array(CURLOPT_TIMEOUT, 5),
        array(CURLOPT_RETURNTRANSFER, 1),
    );

    $handle = curl_init();
    foreach ($options as $pair) {
        // One curl_setopt() per option so a rejected option never blocks the rest.
        curl_setopt($handle, $pair[0], $pair[1]);
    }

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
My curl multi-threaded function:
/**
 * Download several URLs concurrently through the cURL multi interface.
 *
 * Every transfer uses the same options as singlethread_crawl() except that
 * the timeout is supplied by the caller.
 *
 * @param array  $urls    URLs to fetch; the keys are reused as keys of the result.
 * @param int    $timeout Per-transfer timeout in seconds (CURLOPT_TIMEOUT).
 * @param string $verbose "yes" echoes one line per failed URL,
 *                        "no" echoes a single "errors X | success Y" summary.
 * @return array Response bodies keyed like $urls; failed URLs have no entry.
 */
function multithread_crawl($urls, $timeout, $verbose)
{
    $agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
    $mh = curl_multi_init();
    $conn = array();

    foreach ($urls as $i => $url) {
        // curl_init($url) already sets CURLOPT_URL, so it is not set a second time.
        $conn[$i] = curl_init($url);
        curl_setopt($conn[$i], CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($conn[$i], CURLOPT_NOSIGNAL, 1);
        curl_setopt($conn[$i], CURLOPT_NOPROGRESS, 1);
        curl_setopt($conn[$i], CURLOPT_FAILONERROR, 1);
        curl_setopt($conn[$i], CURLOPT_USERAGENT, $agent);
        curl_setopt($conn[$i], CURLOPT_SSL_VERIFYPEER, 0);
        curl_setopt($conn[$i], CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($conn[$i], CURLOPT_MAXREDIRS, 1);
        curl_setopt($conn[$i], CURLOPT_TIMEOUT, $timeout);
        curl_multi_add_handle($mh, $conn[$i]);
    }

    // Kick off all transfers.
    do {
        $mrc = curl_multi_exec($mh, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);

    while ($active && $mrc == CURLM_OK) {
        // curl_multi_select() may return -1. Previously that skipped
        // curl_multi_exec() altogether, so $active never changed and the loop
        // spun forever. Now we pause briefly and keep driving the transfers.
        if (curl_multi_select($mh) == -1) {
            usleep(1000);
        }
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    }

    if ($mrc != CURLM_OK) {
        print "Curl multi read error $mrc\n";
    }

    // Collect the per-transfer result codes; the multi interface reports
    // them only through curl_multi_info_read().
    $codes = array();
    while (($info = curl_multi_info_read($mh)) !== false) {
        $key = array_search($info['handle'], $conn, true);
        if ($key !== false) {
            $codes[$key] = $info['result'];
        }
    }

    $res = array();
    $e = 0;
    foreach ($urls as $i => $url) {
        $code = isset($codes[$i]) ? $codes[$i] : CURLE_OK;
        $err = curl_error($conn[$i]);
        if ($err == '' && $code != CURLE_OK) {
            // No message text available: fall back to the numeric result code.
            $err = "curl error code ".$code;
        }

        if ($err == '') {
            $res[$i] = curl_multi_getcontent($conn[$i]);
        } else {
            // Always count failures so the summary is right for any $verbose value.
            $e++;
            if ($verbose == "yes") {
                echo "error: ".$url." (".$err.")\n";
            }
        }

        curl_multi_remove_handle($mh, $conn[$i]);
        curl_close($conn[$i]);
    }
    curl_multi_close($mh);

    $s = count($urls) - $e;
    if ($verbose == "no") {
        echo "errors ".$e." | success ".$s."\n";
    }

    return $res;
}
However there are some annoyances in curl - the main one for me being that you can't pass variables to the write_function,
curl_setopt($conn[$i], CURLOPT_WRITEFUNCTION, myfunction);
which makes it useless for updating rows etc. in a db (you can use curl_getinfo to get the URL and then do a lookup - but that is pretty backwards). This means that the crawling is not even close to being truly multithreaded, as you have to wait for all URLs to finish before working with the data.
So I thought I'd have a go at writing the raw crawler myself using fsockopen. It is not perfect, as the multi-threaded function relies on the single-threaded one to follow any redirects.
My own single-threaded function:
/**
 * Fetch one URL over a raw socket with a hand-built HTTP/1.0 GET request.
 *
 * @param string $url       URL to fetch; "http://" is assumed when no scheme is given.
 * @param int    $timeout   Connection timeout in seconds, passed to fsockopen().
 * @param int    $maxredirs How many Location redirects may still be followed.
 * @return array|null The parse_url() parts of the final URL plus 'port',
 *                    'header' (raw response headers) and 'html' (response body);
 *                    null when the URL is unusable or the connection fails
 *                    (a message is echoed in both cases).
 */
function mycrawler_single($url, $timeout=10, $maxredirs=1)
{
    $urlinfo = parse_url($url);
    if (!is_array($urlinfo) || empty($urlinfo['scheme'])) {
        $urlinfo = parse_url('http://'.$url);
    }
    // parse_url() returns false for malformed input and may yield no host.
    if (!is_array($urlinfo) || empty($urlinfo['host'])) {
        echo "(0)Invalid URL: ".$url."\n";
        return null;
    }
    if (empty($urlinfo["path"])) {
        $urlinfo["path"] = "/";
    }

    $scheme = strtolower($urlinfo['scheme']);
    $secure = ($scheme == "https");
    if (empty($urlinfo['port'])) {
        // Unknown schemes fall back to port 80 instead of leaving the port undefined.
        $urlinfo['port'] = $secure ? 443 : 80;
    }

    $target = $urlinfo["path"];
    if (isset($urlinfo["query"])) {
        $target .= "?".$urlinfo["query"];
    }
    $request  = "GET ".$target." HTTP/1.0\r\n";
    $request .= "Host: ".$urlinfo['host']."\r\n";
    $request .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)\r\n";
    $request .= "Connection: close\r\n\r\n";

    // https needs the ssl:// transport; a bare TCP socket to port 443 would
    // send the request unencrypted and never get a usable reply.
    $remote = ($secure ? "ssl://" : "").$urlinfo['host'];
    $fp = fsockopen($remote, $urlinfo['port'], $errno, $errstr, $timeout);
    if (!$fp) {
        echo "(".$errno.")".$errstr."\n";
        return null;
    }

    fwrite($fp, $request);
    $data = '';
    while (!feof($fp)) {
        $chunk = fgets($fp, 4096);
        if ($chunk === false) {
            // Read error or timeout: feof() can stay false, so stop explicitly.
            break;
        }
        $data .= $chunk;
    }
    fclose($fp);

    $parts = explode("\r\n\r\n", $data, 2);
    $urlinfo['header'] = $parts[0];
    $urlinfo['html'] = isset($parts[1]) ? $parts[1] : '';

    if ($maxredirs > 0 && preg_match("/\r\nlocation:(.*)/i", $urlinfo['header'], $match)) {
        $redirect = trim($match[1]);
        if ($redirect !== '') {
            // Resolve the two common relative forms against the current URL.
            // Other relative forms (e.g. "page.html") are passed on unchanged.
            if (strpos($redirect, '//') === 0) {
                $redirect = $scheme.":".$redirect;
            } elseif (strpos($redirect, '/') === 0) {
                $redirect = $scheme."://".$urlinfo['host'].":".$urlinfo['port'].$redirect;
            }
            echo "Redirecting to ".$redirect."\n";
            return mycrawler_single($redirect, $timeout, $maxredirs - 1);
        }
    }

    return $urlinfo;
}
My own multi-threaded function:
function mycrawler_multi($urls, $timeout=10, $maxredirects=1)
{
for ($i=0; $i<count($urls); $i++)
{
$urlinfo[$i] = parse_url($urls[$i]);
$maxredirs[$i] = $maxredirects;
if (empty($urlinfo[$i]['scheme'])) {$urlinfo[$i] = parse_url('http://'.$url);}
if (empty($urlinfo[$i]["path"])) {$urlinfo[$i]["path"]="/";}
if (empty($urlinfo[$i]['port']))
{
switch($urlinfo[$i]['scheme'])
{
case "http":
$urlinfo[$i]['port'] = 80;
break;
case "https":
$urlinfo[$i]['port'] = 443;
break;
}
}
if (isset($urlinfo[$i]["query"]))
{
$request[$i] = "GET ".$urlinfo[$i]["path"]."?".$urlinfo[$i]["query"]." ";
} else {
$request[$i] = "GET ".$urlinfo[$i]["path"]." ";
}
$request[$i] .= "HTTP/1.0\r\n";
$request[$i] .= "Host: ".$urlinfo[$i]['host']."\r\n";
$request[$i] .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)\r\n";
$request[$i] .= "Connection: close\r\n\r\n";
$fp[$i] = fsockopen($urlinfo[$i]['host'], $urlinfo[$i]['port'], $urlinfo[$i]['errno'], $urlinfo[$i]['errstr'], $timeout);
socket_set_blocking($fp[$i], false);
if (!$fp[$i])
{
echo "(".$urlinfo[$i]['errno'].")".$urlinfo[$i]['errstr']."\n";
}
else
{
fwrite($fp[$i], $request[$i]);
}
}
$done = false;
$numdone = array();
while (!$done)
{
for ($i=0; $i<count($urls); $i++)
{
if (!feof($fp[$i]))
{
$data[$i] .= fgets($fp[$i], 4096);
}
elseif (empty($numdone[$i]))
{
$numdone[$i] = 1;
$tmp[$i] = explode("\r\n\r\n", $data[$i], 2);
$urlinfo[$i]['header'] = $tmp[$i][0];
$urlinfo[$i]['html'] = $tmp[$i][1];
if ((stripos($urlinfo[$i]['header'], "location:")) && ($maxredirs[$i] > 0))
{
preg_match("/\r\nlocation:(.*)/i", $urlinfo[$i]['header'], $match[$i]);
if ($match[$i])
{
$redirect[$i] = trim($match[$i][1]);
echo "Redirecting to ".$redirect[$i]."\n";
$maxredirs[$i]--;
&n