HTTPクローラ

とりあえず整理したのでアップロード。

とりあえず整理したのでアップロード。

  • タグ:
  • タグはありません
<?php
// Cookie jar shared by every http() call: cookie name => cookie value.
$dummyCookie = array();

// Log of the raw header blocks, one entry appended per request made by http().
$returns = array();
$returns["req_head"] = array();
$returns["res_head"] = array();

// Number of redirects followed so far; read and updated inside http().
$redirect_count = 0;
/*
 * Sends one HTTP/1.1 request over a raw socket and returns the response body.
 *
 * $url     : Target URL. When it contains no host, the host falls back to
 *            "sp.mbga.jp" and the URL is treated as a path on that host.
 * $port    : TCP port to connect to. This value is always used; a port written
 *            inside $url is ignored. Port 443 switches the socket to ssl://.
 * $method  : Request method (GET, POST, HEAD, ...). Redirects are always
 *            followed with GET.
 * $headers : Extra request headers as array(name => value). A single raw
 *            "Name: value" string is accepted as well.
 * $cc      : Cookie string ("Cookie: a=b; c=d" or just "a=b; c=d") that is
 *            merged into the shared cookie jar before the request is built.
 * $post    : POST fields as array(name => value); only used when $method is POST.
 *
 * Globals touched:
 *   $dummyCookie    - cookie jar; updated from $cc and from Set-Cookie headers.
 *   $returns        - "req_head"/"res_head" get the raw header blocks (CRLF
 *                     replaced by <BR>), "body" gets the body of every response
 *                     that was a redirect.
 *   $redirect_count - redirects followed in the current chain; reset to 0 once
 *                     a non-redirect response is reached.
 *
 * Returns the body of the final response ("" when the response has no body).
 * The body is returned exactly as received: chunked transfer encoding and
 * gzip/deflate content encoding are NOT decoded here.
 *
 * Terminates the script with die("ERROR\n") when the connection cannot be
 * opened, and with exit when more than two redirects are chained.
 */
function http($url, $port=80, $method="GET", $headers=null, $cc=null, $post=array(""))
{
    global $dummyCookie;
    global $returns;
    global $redirect_count;

    if (!is_array($dummyCookie)) {
        $dummyCookie = array();
    }

    // ---- Split the URL into the pieces needed for the request line ----
    $URL = parse_url($url);
    if (!is_array($URL)) {
        // parse_url() returns false for seriously malformed URLs.
        $URL = array();
    }
    $query = isset($URL['query']) ? "?".$URL['query'] : "";
    $path  = isset($URL['path']) ? $URL['path'] : "";
    $host  = isset($URL['host']) ? $URL['host'] : "sp.mbga.jp";
    // The request target must start with "/" (covers "http://host" with no
    // path as well as host-less relative input), without doubling the slash.
    if ($path === "" || $path[0] !== "/") {
        $path = "/".$path;
    }

    // ---- Request line and headers ----
    $request  = $method." ".$path.$query." HTTP/1.1\r\n";
    $request .= "Host: ".$host."\r\n";

    if (is_array($headers)) {
        foreach ($headers as $k => $h) {
            $request .= $k.":".$h."\r\n";
        }
    } elseif (is_string($headers) && trim($headers) !== "") {
        // Backward compatibility: a single pre-formatted header line.
        $request .= trim($headers)."\r\n";
    }

    // ---- Merge caller supplied cookies into the jar ----
    if ($cc) {
        $cc = str_replace("Cookie: ", "", $cc);
        foreach (explode(";", $cc) as $pair) {
            // Limit 2: cookie values may themselves contain "=" (e.g. base64).
            $kv = explode("=", $pair, 2);
            $cookieName = trim($kv[0]);
            if ($cookieName !== "") {
                $dummyCookie[$cookieName] = isset($kv[1]) ? trim($kv[1]) : "";
            }
        }
    }

    // ---- Send every non-empty cookie of the jar ----
    $cookiePairs = array();
    foreach ($dummyCookie as $key => $cookie) {
        if ($cookie != null) {
            $cookiePairs[] = "{$key}={$cookie}";
        }
    }
    if (count($cookiePairs) > 0) {
        $request .= "Cookie: ".implode("; ", $cookiePairs)."\r\n";
    }

    $request .= "x-msim-use: on\r\n";

    // ---- Body (POST only) and end of headers ----
    if (strtoupper($method) == "POST") {
        $fields = array();
        if (is_array($post)) {
            foreach ($post as $name => $value) {
                $fields[] = urlencode((string)$name)."=".urlencode($value);
            }
        }
        $postdata = implode("&", $fields);
        $request .= "Content-Type: application/x-www-form-urlencoded\r\n";
        $request .= "Content-Length: ".strlen($postdata)."\r\n";
        $request .= "\r\n";
        // Nothing may follow the body: Content-Length has to match exactly.
        $request .= $postdata;
    } else {
        $request .= "\r\n";
    }

    // ---- Connect ----
    if ($port == 443) {
        $fp = fsockopen("ssl://".$host, $port);
    } else {
        $fp = fsockopen($host, $port);
    }
    if (!$fp) {
        die("ERROR\n");
    }

    // Per-read timeout of 2 seconds.
    stream_set_timeout($fp, 2);
    fwrite($fp, $request);

    // ---- Read the whole response ----
    $response = "";
    while (!feof($fp)) {
        $line = fgets($fp, 1024);
        if ($line === false) {
            // EOF or read timeout. On a keep-alive connection feof() stays
            // false after a timeout, so stop here instead of spinning until
            // the server finally drops the connection.
            break;
        }
        $response .= $line;
    }
    fclose($fp);

    // ---- Split head / body ----
    $DATA = explode("\r\n\r\n", $response, 2);
    $head = $DATA[0];
    $body = isset($DATA[1]) ? $DATA[1] : "";

    // ---- Inspect the response headers only (never the body) ----
    $status = 0;
    if (preg_match('#^HTTP/[0-9.]+\s+([0-9]{3})#', $head, $m)) {
        $status = (int)$m[1];
    }
    $redirect = null;
    foreach (explode("\r\n", $head) as $headerLine) {
        $parts = explode(":", $headerLine, 2);
        if (count($parts) < 2) {
            continue; // status line or malformed line
        }
        // Header names are case-insensitive.
        $headerName  = strtolower(trim($parts[0]));
        $headerValue = trim($parts[1]);

        if ($headerName === "set-cookie") {
            // Only the leading name=value pair is stored; attributes such as
            // path/domain/expires are not tracked by this simple jar.
            $attributes = explode(";", $headerValue);
            $kvp = explode("=", array_shift($attributes), 2);
            $key_name = trim($kvp[0]);
            if ($key_name !== "") {
                $dummyCookie[$key_name] = isset($kvp[1]) ? trim($kvp[1]) : "";
            }
        } elseif ($headerName === "location") {
            $redirect = $headerValue;
        }
    }

    // ---- Record the exchange ----
    $returns["req_head"][] = str_replace("\r\n", "<BR>", $request);
    $returns["res_head"][] = str_replace("\r\n", "<BR>", $head);

    // ---- Follow redirects (3xx + Location) ----
    if ($redirect != null && $status >= 300 && $status < 400) {
        $redirect_count++;
        if ($redirect_count > 2) {
            // Too many chained redirects: give up.
            exit;
        }
        $returns["body"][] = $body;

        $next = parse_url($redirect);
        if (is_array($next) && isset($next['host'])) {
            // Absolute (or scheme-relative) target: pick the port that matches it.
            if (isset($next['port'])) {
                $nextPort = $next['port'];
            } elseif (isset($next['scheme'])) {
                $nextPort = (strtolower($next['scheme']) === "https") ? 443 : 80;
            } else {
                $nextPort = $port;
            }
        } else {
            // Relative target: resolve it against the server just contacted.
            $scheme = ($port == 443) ? "https" : "http";
            if ($redirect[0] === "?") {
                $redirect = $path.$redirect;
            } elseif ($redirect[0] !== "/") {
                $redirect = substr($path, 0, strrpos($path, "/") + 1).$redirect;
            }
            $redirect = $scheme."://".$host.$redirect;
            $nextPort = $port;
        }

        // The Referer is the page that issued the redirect.
        return http($redirect, $nextPort, "GET", array("Referer" => $url));
    }

    // End of a redirect chain (or no redirect at all): start counting afresh.
    $redirect_count = 0;

    return $body;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
//
//
//
// HTML
//
//
////
//GETPOST
/*
 * Looks up a request parameter by name.
 * A value in $_POST takes precedence over one in $_GET;
 * null is returned when the parameter is present in neither.
 */
function GetParam( $name )
{
    if (isset($_POST[$name])) {
        return $_POST[$name];
    }
    if (isset($_GET[$name])) {
        return $_GET[$name];
    }
    return null;
}
//
// Read the request parameters (GetParam() prefers POST over GET).
$url = GetParam("url");
$port = GetParam("port");
$method = GetParam("mt");
$cookie = GetParam("c");
$postParam = GetParam("pp");
// NOTE(review): this unconditionally discards the "url" parameter, so the
// null check below can never fire. Looks like a debugging leftover - confirm
// before removing.
$url="";
// Apply defaults / validate.
if( $url === null )
{
    echo "URL";
    exit;
}
if( $port == null )
{
    $port = 80;
}
if( $method === null )
{
    $method = "GET";
}
// Decode the POST fields passed in "pp" as "name::value,name::value,...".
$params = array();
if( $postParam )
{
    foreach( explode( ",", $postParam ) as $entry )
    {
        if( $entry === "" )
        {
            // Empty entry (e.g. trailing comma): nothing to add.
            continue;
        }
        // Limit 2 so a value that itself contains "::" is kept intact.
        $pair = explode( "::", $entry, 2 );
        // An entry without "::" becomes a field with an empty value instead
        // of reading an undefined offset.
        $params[$pair[0]] = isset($pair[1]) ? $pair[1] : "";
    }
}
// Request headers sent with the request; the User-Agent is an iPhone
// Mobile Safari string (original note: m○bage).
$request_header = array();
//$request_header["Referer"] = "";
$request_header["User-Agent"] = "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8C148 Safari/6531.22.7";
$request_header["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
$request_header["Accept-Language"] = "ja,en-us;q=0.7,en;q=0.3";
$request_header["Accept-Encoding"] = "gzip,deflate";
$request_header["Accept-Charset"] = "Shift_JIS,utf-8;q=0.7,*;q=0.7";
$request_header["Keep-Alive"] = "115";
$request_header["Connection"] = "keep-alive";
/*
// Basic authentication
if ($user && $pass) {
$request_header["Authorization"] = "Basic ".base64_encode($user.":".$pass);
}
*/
// Fetch the page.
// NOTE(review): the parameter-driven call is disabled and a fixed URL is
// requested instead, so $url/$port/$method/$cookie/$params are currently
// unused. Also note the https URL is fetched over port 80, i.e. as plain
// HTTP, because http() only switches to SSL when the port is 443.
//$body = http( $url, $port, $method, $request_header, $cookie, $params );
$body = http( "https://www.google.co.jp/", 80, "GET", $request_header, null, null );
//()
//echo $body;
//////
//$body
//////
//
//
/*
$counts = count($returns["req_head"]);
for( $i=0; $i<$counts; $i++ )
{
echo "<br />";
echo $returns["req_head"][$i];
echo "<hr>";
echo "<br />";
echo $returns["res_head"][$i];
echo "<hr>";
}
*/
//
/*
$fp = fopen( "http.txt", "w" );
if( $fp )
{
fwrite($fp,$body,strlen($body));
fclose($fp);
}
*/
//HTML
/*
//HTML
$cookies = "";
if( $dummyCookie != null )
{
$cookies .= "Cookie: ";
foreach( $dummyCookie as $key => $cookie )
{
if( $cookie != null )
{
$cookies .= "{$key}={$cookie}; ";
}
}
$cookies .= "\r\n";
}
$c_enc = urlencode($cookies);
$c_enc = str_replace( "+", "%20", $c_enc );
*/
/*
echo "<script type='text/javascript'>";
echo "req_callback(\"".$c_enc."\")";
echo "</script>";
*/
?>
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX