= 400, and pages with a
 *     content_type != "text/*" by default.)
 *
 * For each chosen page:
 *
 * 3. [Check robots.txt] If the chosen page is NOT "/robots.txt":
 *    a. Retrieve the "/robots.txt" page record from the database for the
 *       site of the chosen page. (Recursive call here.)
 *    b. If the record does not exist, or the page has never been indexed,
 *       or was last indexed over $robots_txt_expiry days ago, then
 *       re-index it.
 *    c. Check robots.txt for an exclusion rule for $user_agent and the
 *       name of the chosen page.
 *    d. If the robot is not allowed, then set the no_robots flag and stop
 *       indexing the page.
 *
 * 4. [Retrieve] Download the page from the server, and store it in the
 *    $page array.
 *
 * 5. [Check response] Check for special cases of response.
 *    a. If the connection failed, then set status = 0 and stop indexing.
 *    b. If the response status was >= 400, then update the status field
 *       and stop indexing.
 *    c. If the response included a Location header, then add a link to
 *       the database for it, update status, and stop indexing.
 *    d. If the content_type was not "text/*", then stop indexing.
 *
 * 6. [Parse words] Build a list of words appearing in the page. (If the
 *    page has a content type like "text/html" or "text/xml", then words
 *    appearing in the structural part are ignored.)
 *
 * 7. [Parse links] Build a list of links appearing in the page. (Links
 *    include <a href>, <frame src> and <img src>.) Also extract the
 *    title of the page if it exists.
 *
 * 8. [Stop indexing] Update the database according to how far the
 *    indexing process got.
 *    a. Update the last_accessed field of search_hosts.
 *    b. Update the search_pages table with all the links found.
 *    c. Update the search_links table with the link ids from (b).
 *    d. Update the search_words and search_words_pages tables.
 *    e. Update the chosen page in search_pages.
 *
 * 9. [Next page] Start again from step 3 for the next chosen page.
*/

/*
 * Main script: connect to the database, load the word priority and
 * ignore tables, choose up to $max_pages pages and index them one by one.
 *
 * Reads these settings, which are defined outside this section:
 * $max_pages, $host_access_expiry, $robots_txt_expiry, $sleep_time,
 * $sleep_time_final.
 */
ignore_user_abort(true);
ob_implicit_flush(true);
header('Content-Type: text/plain');

mysql_connect('localhost', 'chrysophylax')
    or die(__LINE__ . ': ' . mysql_error());
mysql_select_db('chrysophylax')
    or die(__LINE__ . ': ' . mysql_error());

/* Load search_priorities. */
$result = mysql_query("SELECT word, priority FROM search_priorities")
    or die(__LINE__ . ': ' . mysql_error());
$priorities = array();
while ($row = mysql_fetch_array($result)) {
    $priorities[$row['word']] = $row['priority'];
}

/* Load search_ignore. */
$result = mysql_query("SELECT word FROM search_ignore")
    or die(__LINE__ . ': ' . mysql_error());
$ignore = array();
while ($row = mysql_fetch_array($result)) {
    $ignore[$row['word']] = true;
}

/*
 * Choose pages.
 *
 * The backslash of the REGEXP pattern is written four times here: PHP
 * halves it once and the MySQL string literal halves it again, so the
 * regular expression finally sees "\." (a literal dot).  With a single
 * backslash MySQL dropped it and the pattern matched ANY character
 * before the extension.
 */
echo "\n\n\nChoosing up to $max_pages pages for indexing... ";
set_time_limit(15 * 60);
$choose_result = mysql_query(
    "SELECT h.id AS host_id, h.name AS host, p.id AS page_id, p.name AS page
     FROM search_hosts AS h, search_pages AS p
     WHERE (h.id = p.host_id)
       AND (p.no_robots = 'N')
       AND (p.status IS NULL OR p.status BETWEEN 200 AND 399)
       AND (p.content_type IS NULL OR p.content_type LIKE 'text/%')
       AND (h.broken = 'N')
       AND LOWER(p.name) NOT REGEXP '\\\\.(mp3|gz|pdf|zip|ps|jpg|gif|jpeg|png|iso|jpe|exe)\$'
     ORDER BY p.last_updated,
              IF(h.last_accessed > DATE_SUB(NOW(), INTERVAL $host_access_expiry DAY) AND h.id <> 1, h.last_accessed, NULL),
              p.priority DESC
     LIMIT $max_pages")
    or die(__LINE__ . ': ' . mysql_error());

$choice_count = mysql_num_rows($choose_result);
echo "selected " . $choice_count . " pages.\n\n";

$choice_pos = 0;
while ($choice = mysql_fetch_array($choose_result)) {
    $choice_pos++;
    /*
     * The "skip consecutive pages of the same host" check is disabled;
     * every chosen page is indexed.  $last_host_id is still tracked so
     * the check can be switched back on.
     */
    // if ($choice['host_id'] != @$last_host_id) {
    set_time_limit(15 * 60);
    index_page($choice);
    echo "\n";
    sleep($sleep_time);
    // }
    if ($choice['host_id'] != 1) {
        $last_host_id = $choice['host_id'];
    }
}

if ($sleep_time_final > $sleep_time) {
    sleep($sleep_time_final - $sleep_time);
}
exit;

/*
 * Index one page: check robots.txt, download the page, extract words and
 * links, and write everything back to the database.
 *
 * $choice is an array with the keys 'host_id', 'host', 'page_id' and
 * 'page' (a row of the "choose pages" query, or the equivalent row for
 * "/robots.txt" when the function calls itself).
 *
 * Returns the $page array produced by http_get() (plus 'title' when one
 * was found), or an empty array when robots.txt forbids the page.
 *
 * Dies with the line number and the MySQL error when a query fails.
 */
function index_page($choice)
{
    global $robots_txt_expiry;
    global $host_access_expiry;
    global $priorities;
    global $ignore;
    global $choice_count, $choice_pos;
    /*
     * Nesting depth, used to indent the log.  This is a global rather
     * than a static because add_link() reads "global $indent" for its
     * own log lines.
     */
    global $indent;

    if (!isset($indent)) {
        $indent = 0;
    }
    $indent++;

    $words1 = array();      /* Content words */
    $words2 = array();      /* Structure words */
    $links = array();       /* Links to other pages */
    $no_robots = false;
    $page = array();
    $last_time = 0;         /* Time of the last progress dot */

    echo str_repeat(' ', $indent) . "($choice_pos/$choice_count) [{$choice['host_id']} {$choice['page_id']}] {$choice['host']}{$choice['page']}\n";

    /*
     * Check robots.txt
     */
    if ($choice['page'] != '/robots.txt' and $choice['page'] != '/') {
        /*
         * content_type is selected so that a cached robots.txt can pass
         * the "text/plain" test below, and a row that was never indexed
         * (last_updated IS NULL) counts as expired.
         */
        $result = mysql_query(
            "SELECT h.id AS host_id, h.name AS host, p.id AS page_id,
                    p.name AS page, p.content AS content, p.status AS status,
                    p.content_type AS content_type,
                    IF(p.last_updated IS NULL OR p.last_updated < DATE_SUB(NOW(), INTERVAL $robots_txt_expiry DAY),1,0) AS expired
             FROM search_hosts AS h, search_pages AS p
             WHERE (h.id = p.host_id)
               AND (h.id = {$choice['host_id']})
               AND (p.name = '/robots.txt')
             LIMIT 1")
            or die(__LINE__ . ': ' . mysql_error());

        if (mysql_num_rows($result) == 1) {
            $robots_choice = mysql_fetch_array($result);
        } else {
            mysql_query("INSERT INTO search_pages (host_id,name,referrer) VALUES ({$choice['host_id']}, '/robots.txt', {$choice['page_id']})")
                or die(__LINE__ . ': ' . mysql_error());
            $robots_choice = array(
                'host_id' => $choice['host_id'],
                'host'    => $choice['host'],
                'page_id' => mysql_insert_id(),
                'page'    => '/robots.txt',
                'expired' => 1);
        }

        if ($robots_choice['expired']) {
            $robots_txt = index_page($robots_choice);   /* Recursive call */
        } else {
            $robots_txt = $robots_choice;
        }

        /*
         * Only a robots.txt that was really served (status 200, plain
         * text) is consulted.  These must be comparisons: the former
         * "=" assignments made both tests always succeed, so e.g. a 404
         * error page was parsed as if it were robots rules.
         */
        $robots_status = isset($robots_txt['status']) ? $robots_txt['status'] : 0;
        $robots_type = isset($robots_txt['content_type'])
            ? strtolower(trim($robots_txt['content_type'])) : '';
        $robots_body = isset($robots_txt['content']) ? $robots_txt['content'] : '';
        if ($robots_status == 200
            and $robots_type == 'text/plain'
            and !robot_allowed($robots_body, $choice['page'])) {
            $no_robots = true;
        }
    }

    /*
     * Retrieve and Check response.
     */
    if (!$no_robots) {
        echo str_repeat(' ', $indent) . " Retrieving ";
        $page = http_get($choice['host'] . $choice['page']);
        if ($page['status'] != 0) {
            echo ' (' . strlen($page['headers']) . '+' . strlen($page['content']) . ' bytes)';
        }
        echo "\n";
        if (isset($page['Location'])) {
            add_link($page['Location'], $choice['host'] . $choice['page'], $links);
        }
    }

    $status = isset($page['status']) ? $page['status'] : 0;
    $content_type = isset($page['content_type']) ? $page['content_type'] : '';
    $is_text_ok = (!$no_robots and $status >= 200 and $status < 300);

    /*
     * Parse words.
     */
    if ($is_text_ok and preg_match('#^text/#i', $content_type)) {
        if (preg_match('#^text/.+ml#s', $content_type)) {
            /*
             * Markup: split at "<".  Each chunk is "tag>text"; the tag
             * part goes to the structure words and the text part to the
             * content words.  The "<>" prefix makes any text before the
             * first tag look like the text part of an empty tag.
             */
            foreach (explode('<', '<>' . $page['content']) as $tok) {
                if ($tok == '') {
                    continue;
                }
                $gt = strpos($tok, '>');
                if ($gt === false) {
                    $s1 = '';
                    $s2 = $tok;
                } else if (preg_match('/^(script|style)[ \r\n\t>]/i', $tok)) {
                    /* Script and style bodies count as structure. */
                    $s1 = $tok;
                    $s2 = '';
                } else {
                    $s1 = substr($tok, 0, $gt);
                    $s2 = substr($tok, $gt + 1);
                }
                parse_string($s1, $words2);
                parse_string($s2, $words1);
            }
            echo str_repeat(' ', $indent) . " Structured text parsed (" . count($words1) . '+' . count($words2). " words)\n";
        } else {
            parse_string($page['content'], $words1);
            echo str_repeat(' ', $indent) . " Plain text parsed (" . count($words1) . " words)\n";
        }

        /*
         * Merge the two words tables into just words1.
         * (Disabled: structure words are counted above but not stored.)
         */
        /*
        while (list($a,$b) = each ($words2)) {
            $words1[$a] = (@$words1[$a] + 0) . ',' . $b;
        }
        while (list($a,$b) = each ($words1)) {
            if (!ereg ('\,', $b)) {
                $words1[$a] .= ',0';
            }
        }
        */
    }

    /*
     * Parse links.
     */
    if ($is_text_ok and preg_match('#^text/.+ml#is', $content_type)) {
        $base = $choice['host'] . $choice['page'];
        $chunks = explode('<', $page['content']);
        array_shift($chunks);       /* Text before the first tag is no tag. */

        foreach ($chunks as $s) {
            if ($s == '') {
                continue;
            }
            $gt = strpos($s, '>');
            $tag = ($gt !== false) ? (string) substr($s, 0, $gt) : $s;

            /* Tag name = first whitespace-delimited word, lower-cased. */
            $parts = preg_split("/[ \t\r\n]+/", $tag . ' dummy', 2);
            $name = strtolower($parts[0]);

            if ($name == 'title' and !isset($page['title']) and $gt !== false) {
                $page['title'] = (string) substr($s, $gt + 1);
            }

            $attr = ($name == 'frame' or $name == 'img') ? 'src' : 'href';
            $rest = stristr($tag, $attr . '=');
            if ($rest !== false) {
                /* Skip "attr=", one optional quote, then cut at quote/space. */
                $rest = (string) substr($rest, strlen($attr) + 1);
                if (preg_match('/^[=\'"\\\\]/', $rest)) {
                    $rest = (string) substr($rest, 1);
                }
                $vals = preg_split("/[\\\\'\" \t\r\n]+/", $rest . ' x', 2);
                $target = $vals[0];

                switch ($name) {
                case 'a':
                case 'frame':
                case 'img':
                    add_link($target, $base, $links);
                    break;
                case 'base':
                    /* add_link() expects "host/path", without a scheme. */
                    $base = preg_replace('#^(http:)?//#i', '', $target);
                    break;
                }
            }
        }
        echo str_repeat(' ', $indent) . " Links parsed (" . count($links) . " links)\n";
    }

    /*
     * Stop indexing.
     */
    echo str_repeat(' ', $indent) . " Updating database ";

    /* Host information */
    mysql_query("UPDATE search_hosts SET last_accessed = NOW() WHERE id = " . $choice['host_id'])
        or die(__LINE__ . ': ' . mysql_error());
    if (isset($page['Server'])) {
        mysql_query(
            "UPDATE search_hosts SET server = '" . addslashes($page['Server']) . "' WHERE id = {$choice['host_id']}")
            or die(__LINE__ . ': ' . mysql_error());
    }
    if (isset($page['address']) and $page['address'] != ':') {
        mysql_query(
            "UPDATE search_hosts SET address = '" . addslashes($page['address']) . "' WHERE id = {$choice['host_id']}")
            or die(__LINE__ . ': ' . mysql_error());
    }

    /* Words */
    $time_inc = 1;
    $total_priority = 0;
    mysql_query("DELETE FROM search_words_pages WHERE page_id = '{$choice['page_id']}'")
        or die(__LINE__ . ': ' . mysql_error());
    foreach ($words1 as $w => $c) {
        if (!empty($ignore[$w])) {
            continue;
        }
        if (isset($priorities[$w])) {
            $total_priority += $priorities[$w];
        }

        /* Look up the word id, or create a new word record. */
        $result = mysql_query("SELECT id FROM search_words WHERE word = '" . addslashes($w) . "'")
            or die(__LINE__ . ': ' . mysql_error());
        $row = mysql_fetch_row($result);
        $wid = $row ? $row[0] : 0;
        if (!$wid) {
            mysql_query("INSERT INTO search_words (word) VALUES ('" . addslashes($w) . "')")
                or die(__LINE__ . ': ' . mysql_error());
            $wid = mysql_insert_id();
        }
        mysql_query("INSERT INTO search_words_pages (word_id,page_id,count) VALUES ('$wid','{$choice['page_id']}','$c')")
            or die(__LINE__ . ': ' . $w . ': ' . mysql_error());

        /* Progress dots, at growing intervals. */
        if (time() - $last_time > $time_inc) {
            echo '.';
            $last_time = time();
            $time_inc += 1;
        }
    }
    if ($time_inc > 1) {
        echo ' ';
    }
    //echo '((total_priority = ' . $total_priority . '))';

    /* Links */
    $time_inc = 1;
    foreach ($links as $l => $c) {
        /* Divide into host name and page name. */
        $h = substr($l, 0, strpos($l, '/'));
        $p = substr($l, strpos($l, '/'));

        /* Look up host id, or create new host record. */
        $result = mysql_query("SELECT id FROM search_hosts WHERE name = '" . addslashes($h) . "' LIMIT 1")
            or die(__LINE__ . ': ' . mysql_error());
        $row = mysql_fetch_row($result);
        $hid = $row ? $row[0] : 0;
        if (!$hid) {
            mysql_query("INSERT INTO search_hosts (name) VALUES (LOWER('" . addslashes($h) . "'))")
                or die(__LINE__ . ': ' . mysql_error());
            $hid = mysql_insert_id();
        }

        /* Look up page id, or create a new page record. */
        $result = mysql_query("SELECT id FROM search_pages WHERE host_id = '$hid' AND name = '" . addslashes($p) . "' LIMIT 1")
            or die(__LINE__ . ': ' . mysql_error());
        $row = mysql_fetch_row($result);
        $pid = $row ? $row[0] : 0;
        if (!$pid) {
            mysql_query("INSERT INTO search_pages (host_id,name,referrer,priority) VALUES ('$hid','" . addslashes($p) . "','{$choice['page_id']}','$total_priority')")
                or die(__LINE__ . ': ' . mysql_error());
            $pid = mysql_insert_id();
        }

        /* Insert row into search_links table. */
        mysql_query("REPLACE INTO search_links (from_id,to_id,strength) VALUES ({$choice['page_id']},$pid,$c)")
            or die(__LINE__ . ': ' . mysql_error());

        if (time() - $last_time > $time_inc) {
            echo '.';
            $last_time = time();
            $time_inc += 1;
        }
    }
    if ($time_inc > 1) {
        echo ' ';
    }

    /* Page information */
    if ($no_robots) {
        mysql_query(
            "UPDATE search_pages SET no_robots = 'Y' WHERE id = " . $choice['page_id'])
            or die(__LINE__ . ': ' . mysql_error());
    } else {
        if ($page['status'] != 0) {
            if (isset($page['content']) and $page['content'] != '') {
                $md5 = md5($page['content']);
                /* The body is not stored for host 1; only its hash and size. */
                if ($choice['host_id'] != 1) {
                    $con = "'" . addslashes($page['content']) . "'";
                } else {
                    $con = 'NULL';
                }
                mysql_query(
                    "UPDATE search_pages SET content = " . $con . ", content_md5 = '$md5', size = '" . strlen($page['content']) . "' WHERE id = " . $choice['page_id'])
                    or die(__LINE__ . ': ' . mysql_error());
            }
            mysql_query(
                "UPDATE search_pages SET headers = '" . addslashes($page['headers']) . "' WHERE id = " . $choice['page_id'])
                or die(__LINE__ . ': ' . mysql_error());
        }
        if (isset($page['content_type'])) {
            mysql_query(
                "UPDATE search_pages SET content_type = '" . addslashes($page['content_type']) . "' WHERE id = " . $choice['page_id'])
                or die(__LINE__ . ': ' . mysql_error());
        }
        if (isset($page['title'])) {
            mysql_query(
                "UPDATE search_pages SET title = '" . addslashes($page['title']) . "' WHERE id = " . $choice['page_id'])
                or die(__LINE__ . ': ' . mysql_error());
        }
        mysql_query(
            "UPDATE search_pages SET last_updated = NOW(), status = '{$page['status']}' WHERE id = " . $choice['page_id'])
            or die(__LINE__ . ': ' . mysql_error());
    }

    /*
     * Next page.
     */
    echo "\n";
    $indent--;
    return $page;
}

/*
 * Get the contents of a simple HTTP GET request.
 * Return value is an array:
 *   'status_line'  => status line of result.
 *   'headers'      => all headers in received form.
 *   'content'      => contents of result.
 *   'content_type' => content type of the response.
 *   'status'       => numeric status code.
 *   '...'          => all header fields.
*/
/*
 * $url has the form "host[:port]/path" (no scheme).  When the "old"
 * table holds a saved response for that host and path, the saved
 * response is used (and deleted from "old") instead of a network
 * request.
 *
 * array('status' => 0) is returned when the host cannot be resolved,
 * when the connection or the request fails, and when a response taken
 * from the database has a body that disagrees with its Content-Length.
 * A live response with such a mismatch terminates the script.
 *
 * Header names are stored exactly as the server sent them
 * (case-sensitive keys).
 */
function http_get($url)
{
    global $user_agent;
    global $from_email;
    global $referrer;

    /* Split the URL into host[:port] and path. */
    $slash = strpos($url, '/');
    if ($slash !== false) {
        $h = substr($url, 0, $slash);
        $p = substr($url, $slash);
    } else {
        $h = $url;
        $p = '/';
    }

    /* Attempt to reuse the old database to save bandwidth. */
    $result = mysql_query(
        "SELECT id, data FROM old WHERE data IS NOT NULL AND host_name = '" . addslashes($h) . "' AND page_name = '" . addslashes($p) . "'")
        or die(__LINE__ . ': ' . mysql_error());
    $row = mysql_fetch_array($result);

    if ($row) {
        $from_db = true;
        mysql_query("DELETE FROM old where id = '{$row['id']}'")
            or die(__LINE__ . ': ' . mysql_error());
        $str = (string) $row['data'];
        $address = '';
        $port = '';
        echo "(from database)";
    } else {
        $from_db = false;

        /*
         * Decode host and get IP address.
         */
        $host_header = $h;      /* Host: keeps the port, as in the URL. */
        $colon = strpos($h, ':');
        if ($colon !== false) {
            $port = (int) substr($h, $colon + 1);
            $h = substr($h, 0, $colon);     /* Everything before the colon. */
            if ($port <= 0) {
                $port = 80;
            }
        } else {
            $port = 80;
        }

        if (preg_match('/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/', $h)) {
            /* Dotted-quad literal: nothing to resolve. */
            $address = $h;
        } else {
            /* gethostbyname() returns its argument unchanged on failure. */
            $address = @gethostbyname($h);
            if ($address == $h) {
                return array('status' => 0);
            }
        }
        //echo "[host = $h port = $port address = $address]\n";

        /*
         * Connect to remote host.  The socket functions report failure
         * by returning false.
         */
        $sock = @socket_create(AF_INET, SOCK_STREAM, SOL_TCP);
        if ($sock === false) {
            return array('status' => 0);
        }
        if (@socket_connect($sock, $address, $port) === false) {
            socket_close($sock);
            return array('status' => 0);
        }

        $req  = 'GET ' . $p . ' HTTP/1.0' . "\r\n";
        $req .= 'User-Agent: ' . $user_agent . "\r\n";
        $req .= 'From: ' . $from_email . "\r\n";
        $req .= 'Referer: ' . $referrer . "\r\n";
        $req .= 'Host: ' . $host_header . "\r\n";
        $req .= 'Accept: text/*, application/x-httpd-php' . "\r\n";
        $req .= 'Connection: close' . "\r\n";
        $req .= "\r\n";

        if (@socket_write($sock, $req, strlen($req)) === false) {
            socket_close($sock);
            return array('status' => 0);
        }

        /* Read until the server closes the connection. */
        $str = '';
        $time_inc = 1;
        $last_time = 0;
        do {
            set_time_limit(15 * 60);
            $s = @socket_read($sock, 8192);
            $more = ($s !== false and $s !== '');
            if ($more) {
                $str .= $s;
            }
            if (time() - $last_time > $time_inc) {
                echo '.';
                $last_time = time();
                $time_inc += 1;
            }
        } while ($more);
        socket_close($sock);
    }

    /*
     * Parse the status line and the header fields; the first empty line
     * ends the headers.
     */
    $ret = array();
    $ret['status'] = 0;
    $ret['address'] = $address . ':' . $port;

    $pos = 0;
    while (1) {
        $next = find_next_line($str, $pos);
        $line = trim((string) substr($str, $pos, $next - $pos), "\r\n");
        $pos = $next;
        //echo '[[' . $line . ']]';

        if ($line == '') {
            break;
        }

        if (strncmp($line, 'HTTP/', 5) == 0) {
            $ret['status_line'] = $line;
            /* The code is the number after the first space. */
            $space = strpos($line, ' ');
            $ret['status'] = ($space === false) ? 0 : (int) trim(substr($line, $space));
        } else {
            $q = strpos($line, ':');
            if ($q) {
                $a = substr($line, 0, $q);
                $b = trim((string) substr($line, $q + 1), ' ');
                $ret[$a] = $b;
                if (strcasecmp($a, 'Content-Type') == 0) {
                    /* Drop parameters such as "; charset=...". */
                    $semi = strpos($b, ';');
                    $ret['content_type'] = ($semi !== false) ? substr($b, 0, $semi) : $b;
                }
            }
        }
    }

    $ret['headers'] = (string) substr($str, 0, $pos);
    $ret['content'] = (string) substr($str, $pos);

    /* A difference of up to 2 bytes is tolerated. */
    if (isset($ret['Content-Length'])
        && abs((int) $ret['Content-Length'] - strlen($ret['content'])) > 2) {
        echo "\n" . 'NOTICE: Content corrupted? Content-Length = ' . $ret['Content-Length'] . ', strlen(content) = ' . strlen($ret['content']);
        if ($from_db) {
            return array('status' => 0);
        } else {
            exit;
        }
    }

    return $ret;
}

/*
 * Return the offset of the start of the line that follows the line
 * containing offset $p of $str.
 *
 * A line ends at "\r" or "\n"; a pair of two DIFFERENT line-end
 * characters ("\r\n" or "\n\r") counts as a single line end.  When no
 * line end is found, the result is one past the end of the string.
 */
function find_next_line($str, $p)
{
    $len = strlen($str);

    /* Advance to the first line-end character (or the end of the string). */
    while ($p < $len and $str[$p] != "\r" and $str[$p] != "\n") {
        $p++;
    }

    /* Swallow the second half of a two-character line end. */
    if ($p + 1 < $len
        and ($str[$p + 1] == "\r" or $str[$p + 1] == "\n")
        and $str[$p] != $str[$p + 1]) {
        $p++;
    }

    return $p + 1;
}

/*
 * Parse the contents of a robots.txt file and determine whether the
 * given url is permissible.
*/
/*
 * $robots_txt is the body of the robots.txt file and $url the page name
 * (path) to test.  Returns false when a Disallow rule that applies to
 * $user_agent is a prefix of $url, true otherwise.
 *
 * - "#" comments and surrounding white space are ignored.
 * - Consecutive User-agent lines form one group: its rules apply when
 *   any of the lines matches.  A line matches when it is "*" or a
 *   case-insensitive substring of $user_agent.
 * - Rules that precede the first User-agent line apply to everybody.
 * - Only Disallow is interpreted; an empty Disallow allows everything.
 */
function robot_allowed($robots_txt, $url)
{
    global $user_agent;

    $applies = true;            /* Does the current group apply to us? */
    $prev_was_agent = false;    /* Was the previous line a User-agent line? */

    /* preg_split rather than strtok(): callers may be tokenizing too. */
    $lines = preg_split('/[\r\n]+/', (string) $robots_txt, -1, PREG_SPLIT_NO_EMPTY);
    foreach ($lines as $line) {
        $hash = strpos($line, '#');
        if ($hash !== false) {
            $line = substr($line, 0, $hash);
        }
        $line = trim((string) $line);
        if ($line == '') {
            continue;
        }

        if (strncasecmp($line, 'User-agent:', 11) == 0) {
            $ua = trim((string) substr($line, 11));
            $match = ($ua != ''
                and ($ua == '*' or stristr((string) $user_agent, $ua) !== false));
            $applies = $prev_was_agent ? ($applies || $match) : $match;
            $prev_was_agent = true;
        } else {
            $prev_was_agent = false;
            if (strncasecmp($line, 'Disallow:', 9) == 0) {
                $d = trim((string) substr($line, 9));
                if ($applies and $d != '' and strpos($url, $d) === 0) {
                    return false;
                }
            }
        }
    }
    return true;
}

/*
 * Resolve $link against $base and count it in $links.
 *
 * $link   a link as found in a page or in a Location header.
 * $base   "host/path" of the page that contains the link (no scheme).
 * $links  array of "host/path[?query]" => number of occurrences; updated
 *         in place.
 *
 * Fragments ("#...") are dropped.  Only "http://" and scheme-relative
 * ("//host/...") absolute links are followed; links with any other
 * scheme are ignored.  "/./" and "/../" are reduced in the path part
 * only, and "/../" never climbs above the host.  Every accepted link is
 * echoed to the log.
 */
function add_link($link, $base, &$links)
{
    global $indent;     /* Log nesting depth; may be unset. */

    /* Drop the fragment. */
    $p = strpos($link, '#');
    if ($p !== false) {
        $link = (string) substr($link, 0, $p);
    }
    if ($link == '') {
        return;
    }

    /*
     * $base_page is the base without its query; $base_dir is the base
     * up to and including the last slash.
     */
    $p = strpos($base, '?');
    if ($p !== false) {
        $base = (string) substr($base, 0, $p);
    }
    if (strpos($base, '/') === false) {
        $base .= '/';       /* A bare host name means its root page. */
    }
    $base_page = $base;
    $base_dir = substr($base, 0, strrpos($base, '/') + 1);

    //echo '[[ ' . $link . " ]]\n";
    if (preg_match('#^http://#i', $link) or strncmp($link, '//', 2) == 0) {
        /* Absolute: keep everything after the "//". */
        $link = (string) substr($link, strpos($link, '//') + 2);
    } else if (preg_match('/^[a-z0-9-]+:/i', $link)) {
        /* Ignore other protocols for now. */
        return;
    } else if ($link[0] == '/') {
        /* Host-relative. */
        $link = substr($base_dir, 0, strpos($base_dir, '/')) . $link;
    } else if ($link[0] == '?') {
        /* Query only: same page, different query. */
        $link = $base_page . $link;
    } else {
        $link = $base_dir . $link;
    }

    /* Separate the query so that it is left untouched. */
    $p = strpos($link, '?');
    if ($p === false) {
        $path = $link;
        $query = '';
    } else {
        $path = (string) substr($link, 0, $p);
        $query = substr($link, $p);
    }

    /* A link without a host name cannot be stored. */
    if ($path == '' or $path[0] == '/') {
        return;
    }

    /*
     * Reduce the bits where it says /./ or /../ ("/./" first, so that
     * "a/./../b" becomes "b" and not "a/b").
     */
    while (($p = strpos($path, '/./')) !== false) {
        $path = substr($path, 0, $p) . substr($path, $p + 2);
    }
    while (($p = strpos($path, '/../')) !== false) {
        $p2 = strrpos(substr($path, 0, $p), '/');
        if ($p2 === false) {
            /* Already at the host: just drop the "/..". */
            $path = substr($path, 0, $p) . substr($path, $p + 3);
        } else {
            $path = substr($path, 0, $p2) . substr($path, $p + 3);
        }
    }

    /* "host" and "host?query" refer to the root page. */
    if (strpos($path, '/') === false) {
        $path .= '/';
    }
    $link = $path . $query;

    echo str_repeat(' ', (int) $indent) . ' LINK: ' . $link . "\n";
    $links[$link] = isset($links[$link]) ? $links[$link] + 1 : 1;
}

/*
 * Parse a string for words and add them to the $words array.
 *
 * A word is a contiguous sequence of 3 or more alphanumeric symbols,
 * containing at least 2 alphabetic characters.
 *
 * Accented Latin letters are folded to plain ASCII first, both as
 * ISO-8859-1 bytes and as UTF-8 sequences, and the text is lower-cased.
 * The folding table is written with byte escapes so that it does not
 * depend on the encoding this file is saved in.
 *
 * $words maps word => number of occurrences and is updated in place.
 */
function parse_string($str, &$words)
{
    static $fold = null;

    if ($fold === null) {
        /* ISO-8859-1 accented capitals and small letters ... */
        $latin1 = "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF"
                . "\xD2\xD3\xD4\xD5\xD6\xD9\xDA\xDB\xDC"
                . "\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF"
                . "\xF2\xF3\xF4\xF5\xF6\xF9\xFA\xFB\xFC";
        /* ... and the ASCII letter each one is replaced with. */
        $ascii = 'AAAAAAECEEEEIIIIOOOOOUUUUaaaaaaeceeeeiiiiooooouuuu';

        $fold = array();
        $n = strlen($latin1);
        for ($i = 0; $i < $n; $i++) {
            $fold[$latin1[$i]] = $ascii[$i];
            /* UTF-8 form of U+00C0..U+00FF: 0xC3, code point - 0x40. */
            $fold["\xC3" . chr(ord($latin1[$i]) - 0x40)] = $ascii[$i];
        }
    }

    /* Array-form strtr() tries the longest key first (UTF-8 wins). */
    $str = strtolower(strtr((string) $str, $fold));

    if (!preg_match_all('/[A-Za-z0-9]+/', $str, $found)) {
        return;
    }
    foreach ($found[0] as $word) {
        if (strlen($word) >= 3 and preg_match('/[a-z].*[a-z]/', $word)) {
            $words[$word] = isset($words[$word]) ? $words[$word] + 1 : 1;
        }
    }
}

/*
 * Return true when the character $c is an ASCII letter or digit.
 */
function is_alnum($c)
{
    return (($c >= 'a' and $c <= 'z')
         or ($c >= 'A' and $c <= 'Z')
         or ($c >= '0' and $c <= '9'));
}

/* --+ get_old

drop table x2;
create table x2
select p.id
from search_pages as p, search_hosts as h, old
where old.host_name = h.name
  and old.page_name = p.name
  and h.id = p.host_id;

*/
?>