Utente:Leonard Vertighel/temp2
Aspetto
More unfinished stuff (see temp)
Note: decode the HTML-encoded angle bracket in the line: list($c, $r) = preg_split('|</pre>|', $tempp, 2);
<?php
// External link extraction, set-up part: loads the project helpers, opens the
// database handles and starts the page scan consumed by the loop that follows.
require_once('config.php');
require_once('functions.php');

// $dbr is used for the (unbuffered) page scan, $dbw for the inserts issued
// while that scan is still being read.
// NOTE(review): db_connect(true) presumably returns a separate link so the
// writes do not disturb the pending unbuffered result -- confirm in functions.php.
$dbr = db_connect();
$dbw = db_connect(true);

echo "Retrieving pages from db...\n";
// Latest text of every page with page_namespace = 0 that is not a redirect.
$db_query = 'select page_title, old_text from it_page inner join text on page_latest = old_id where page_namespace = 0 and page_is_redirect = 0';
// Unbuffered, so rows are fetched one at a time instead of being loaded at once.
$result = mysql_unbuffered_query($db_query, $dbr);
if ( $result === false ) {
    // Without this check a failed query only surfaces as warnings from
    // mysql_fetch_assoc() and the script ends without processing anything.
    echo "Received MySQL error:\n\n" . mysql_error($dbr) . "\n\nThe query was:\n\n$db_query\n\n";
    exit;
}
$counter = 0;
echo "Starting external link extraction...\n";
// Main loop: one iteration per page row.
//
// The page text is split into labelled parts, stored in $text:
//   'c' = text found inside <!-- --> comments
//   't' = text found inside <nowiki> and <pre> elements
//   's' = body of the "external links" section (only set when one is found)
//   'a' = everything else
// Links are then extracted from every part, written to `extlinks` together
// with the part label, and per-page statistics are written to `extl_stat`.
while ( $row = mysql_fetch_assoc($result) ) {
    $query_parts = array();
    $text = array();
    $title = mysql_escape_string($row['page_title']);
    // do NOT decode char refs, since MW1.6a doesn't for ext links
    $old_text = $row['old_text'];

    // separate commented parts
    $temp = explode('<!--', $old_text);
    $text['c'] = '';
    $rest = $temp[0];
    unset($temp[0]);
    foreach ( $temp as $tempp ) {
        // array_pad(): an unterminated comment yields a single piece
        list($c, $r) = array_pad(explode('-->', $tempp, 2), 2, '');
        $text['c'] .= ' ' . $c;
        $rest .= ' ' . $r;
    }

    // separate nowiki
    $temp = preg_split('/<nowiki[^\n>]*>/iu', $rest);
    $text['t'] = '';
    $rest = $temp[0];
    unset($temp[0]);
    foreach ( $temp as $tempp ) {
        // closing tag matched case-insensitively, like the opening tag above
        list($c, $r) = array_pad(preg_split('|</nowiki>|i', $tempp, 2), 2, '');
        $text['t'] .= ' ' . $c;
        $rest .= ' ' . $r;
    }

    // separate pre
    $temp = preg_split('/<pre[^\n>]*>/iu', $rest);
    $rest = $temp[0];
    unset($temp[0]);
    foreach ( $temp as $tempp ) {
        list($c, $r) = array_pad(preg_split('|</pre>|i', $tempp, 2), 2, '');
        $text['t'] .= ' ' . $c;
        $rest .= ' ' . $r;
    }

    // separate "external links" section (assume only one)
    // capture '==' delimiter in order to split at next section of same or higher level
    // (assuming starting and ending delimiter to be balanced)
    $temp = preg_split('/(?:^|\n)(={2,6})\s*(?:collegamenti|collegamento|link|links|rinvio|rinvii)\s+(?:esterni|esterno)?\s*={2,6}/iu', $rest, 2, PREG_SPLIT_DELIM_CAPTURE);
    $text['a'] = $temp[0];
    // when no such heading exists preg_split() returns one piece only
    $rest = isset($temp[2]) ? $temp[2] : '';
    if ( $rest !== '' ) {
        $hlevel = strlen($temp[1]);
        $temp = preg_split("/(^|\\n)={2,$hlevel}[^=\\n]/", $rest, 2);
        $text['s'] = $temp[0];
        // text after the link section (if any) goes back to the main part
        if ( isset($temp[1]) ) {
            $text['a'] .= $temp[1];
        }
    }

    // finished splitting, now extract links from each part
    foreach ( $text as $type => $part ) {
        // not entirely accurate, since we might chop off
        // a trailing apostrophe. assume for now that
        // this case does not occur
        $part = preg_replace("/'{2,}/", ' ', $part);

        // To obtain a *more or less* accurate result,
        // try to "parse" the templates
        if ( $type !== 't' ) {
            $temp = explode('{{', $part);
            $part = $temp[0];
            unset($temp[0]);
            foreach ( $temp as $tempp ) {
                // strict comparison: '}}' at offset 0 is a match as well
                if ( strpos($tempp, '}}') !== false ) {
                    list($template, $tail) = explode('}}', $tempp, 2);
                    // for now we are just interested in getting rid
                    // of | attached to free links (just naively assuming
                    // them to be all valid separators, period)
                    $part .= ' ' . strtr($template, array('|' => ' ')) . ' ' . $tail;
                } else {
                    // it seems we can't match this
                    // so we put it back to where it came from
                    $part .= '{{' . $tempp;
                }
            }
        }

        // for now we ignore malformed urls, including those
        // accidentally attached to preceding word
        // groups: 1 = opening bracket, 2 = url, 3 = rest of line up to ']', 4 = closing bracket
        preg_match_all('/(?:(\[)|[^A-Za-z0-9]|^)(https?\:\/\/[^][<>"\s]+)([^]\n]*)(\]?)/', $part, $matches, PREG_SET_ORDER);
        foreach ( $matches as $match ) {
            // the url ends at an encoded angle bracket (&lt; / &gt;); whatever
            // follows is treated as the beginning of the link text
            list($url, $ltpart) = array_pad(preg_split('/(?=&(lt|gt);)/', $match[2], 2), 2, '');
            if ( $match[1] && $match[4] ) {
                // bracketed link
                $linktext = mysql_escape_string(trim($ltpart . $match[3]));
                if ( $linktext === '' ) {
                    // ] does never appear in link text, so we use
                    // it to mark bracketed links w/ no text
                    $linktext = ']';
                }
            } else {
                // remove trailing punctuation
                // (bracket cannot be at 0 position)
                // note that MW1.6 cuts off trailing \
                $regexp = ( strpos($url, '(') ) ? '/[,;\\\\.:!?]*$/' : '/[,;\\\\.:!?)]*$/';
                $url = preg_replace($regexp, '', $url);
                // empty linktext implies non-bracketed link
                $linktext = '';
            }

            // split url into parts:
            $url_parts = parse_url($url);
            if ( $url_parts === false ) {
                echo "Malformed URL '$url' found in $title\n";
                continue;
            }
            // parse_url() only returns the components that are present. Adding
            // empty defaults makes extract() (re)define every variable on each
            // pass, so nothing is undefined or left over from the previous link.
            $url_parts += array(
                'scheme' => '',
                'host' => '',
                'port' => '',
                'user' => '',
                'pass' => '',
                'path' => '',
                'query' => '',
                'fragment' => '',
            );
            // raw components: $scheme, $host, $port, ...
            extract(array_map('mysql_escape_string', $url_parts));
            // url-decoded components: $d_host, $d_path, ...
            extract(array_map('mysql_escape_string', array_map('urldecode', $url_parts)), EXTR_PREFIX_ALL, 'd');

            $secure = ( $scheme === 'http' ) ? 0 : 1;

            // mark ip addresses by empty tld
            if ( preg_match('/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/', $d_host) ) {
                $tld = '';
                $domain = $d_host;
            } else {
                $host_parts = explode('.', $d_host);
                $tld = array_pop($host_parts);
                $domain = array_pop($host_parts) . '.' . $tld;
            }

            // 80 is stored whenever the url has no explicit port, whatever the scheme
            if ( !$port ) {
                $port = 80;
            }

            // char refs were deliberately left encoded in the page text; in the
            // decoded query turn the encoded ampersand back into a plain one
            $d_query = strtr($d_query, array('&amp;' => '&'));

            $query_parts[] = "($secure, '$user', '$pass', '$host', $port, '$path', '$query', '$fragment', '$d_user', '$d_pass', '$d_host', '$d_path', '$d_query', '$d_fragment', '$domain', '$tld', '$type', '$linktext', '$title')";
        }
    }

    // write urls to database (one multi-row insert per page)
    $link_count = count($query_parts);
    if ( $link_count ) {
        $db_query = 'insert into extlinks (u_secure, u_user, u_pass, u_host, u_port, u_path, u_query, u_fragment, ud_user, ud_pass, ud_host, ud_path, ud_query, ud_fragment, ud_domain, ud_tld, t_type, t_link_text, t_page_title) values ' . implode(', ', $query_parts);
        mysql_query($db_query, $dbw);
        if ( $error = mysql_error($dbw) ) {
            echo "Received MySQL error:\n\n$error\n\nThe query was:\n\n$db_query\n\n";
            exit;
        }
    }

    // page statistics
    // exclude comments and link section from page length
    $page_len = strlen($text['a']) + strlen($text['t']);
    // links per 1000 bytes of counted text; 0 when there is no counted text
    // (avoids a division by zero)
    $link_dens = ( $page_len > 0 ) ? round($link_count * 1000 / $page_len) : 0;
    $db_query = "insert into extl_stat (page_title, page_len, link_abs, link_rel) values ('$title', $page_len, $link_count, $link_dens)";
    mysql_query($db_query, $dbw);
    if ( $error = mysql_error($dbw) ) {
        echo "Received MySQL error:\n\n$error\n\nThe query was:\n\n$db_query\n\n";
        exit;
    }

    // progress indicator, every 1000 pages
    $counter++;
    if ( ($counter % 1000) == 0 ) {
        echo $counter, "\n";
    }
}
?>
<?php //extlink-check
// Checks one slice of the stored links: sends a HEAD-style request for each
// row and records HTTP status, check time and content type in `extlinks`.
//
// Usage: php extlink-check.php <start> <limit>
//   start  offset of the first row (rows ordered by id)
//   limit  number of rows to check
require_once('config.php');
require_once('functions.php');

if ( !isset($argv[1], $argv[2]) ) {
    echo "Usage: php extlink-check.php <start> <limit>\n";
    exit(1);
}
// Both values are interpolated into the LIMIT clause, so force them to
// non-negative integers.
$start = max(0, (int) $argv[1]);
$limit = max(0, (int) $argv[2]);

$db = db_connect();

// Only links outside comments/nowiki/pre are checked (types 'a' and 's').
$query = "select id, u_secure, u_host, u_port, u_path, u_query from extlinks where t_type = 'a' or t_type = 's' order by id limit $start, $limit";
$result = mysql_query($query, $db);
if ( $result === false ) {
    echo "Received MySQL error:\n\n" . mysql_error($db) . "\n\nThe query was:\n\n$query\n\n";
    exit(1);
}

while ( $row = mysql_fetch_assoc($result) ) {
    $id = (int) $row['id'];
    $u_port = $row['u_port'];
    $u_query = $row['u_query'];

    // Rebuild the url from its stored components.
    $url = $row['u_secure'] ? 'https://' : 'http://';
    $url .= $row['u_host'];
    // 80 is what the extraction script stores for links without an explicit
    // port, so it is left out here and curl uses the scheme's default.
    $url .= ( (int) $u_port !== 80 ) ? ':' . $u_port : '';
    $url .= $row['u_path'];
    $url .= ( $u_query ) ? '?' . $u_query : '';
    // Stored urls keep their char refs; request the url with plain ampersands.
    $url = strtr($url, array('&amp;' => '&'));

    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false); // record redirects as such
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_NOBODY, true);          // headers only
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Certificate checks are switched off: only the response status is of interest.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($ch, CURLOPT_TIMEOUT, 15);
    curl_exec($ch);

    if ( curl_errno($ch) ) {
        // transfer failed (timeout, dns, ...): stored as status -1
        $http_code = -1;
        $content_type = '';
    } else {
        $http_code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
        if ( $content_type ) {
            // keep the media type only, drop parameters such as "; charset=..."
            $pieces = explode(';', $content_type, 2);
            $content_type = $pieces[0];
        } else {
            $content_type = '';
        }
    }
    // release the handle; one is created per row
    curl_close($ch);

    $time = time();
    $content_type = mysql_escape_string($content_type);
    $query = "update extlinks set c_status = $http_code, c_time = $time, c_type = '$content_type' where id = $id";
    mysql_query($query, $db);
    if ( $error = mysql_error($db) ) {
        // report, but keep going with the remaining rows
        echo "Received MySQL error:\n\n$error\n\nThe query was:\n\n$query\n\n";
    }
}
?>
<?php // extlink-check-master
// Starts the link checker workers in the background, each on its own
// consecutive slice of rows: worker $i gets offset $i * $limit.
//
// Optional arguments: [limit [workers]]
//   limit    rows per worker (default 715)
//   workers  number of workers to start (default 100)
// limit * workers has to cover all rows selected by the checker, so
// IMPORTANT: adapt limit to database size! (should really automate this)
$limit = ( isset($argv[1]) && (int) $argv[1] > 0 ) ? (int) $argv[1] : 715;
$workers = ( isset($argv[2]) && (int) $argv[2] > 0 ) ? (int) $argv[2] : 100;

for ( $i = 0; $i < $workers; $i++ ) {
    $start = $i * $limit;
    // trailing '&' with output discarded: do not wait for the worker to finish
    echo shell_exec("php /home/matteo/imago/extlink-check.php $start $limit > /dev/null 2>&1 &");
    echo "$i started ($start, $limit)\n";
}
?>