Utente:Leonard Vertighel/temp

Da Wikipedia, l'enciclopedia libera.
Vai alla navigazione Vai alla ricerca

Unfinished anchor link check script.

Released under GPL.

(Contains code from MediaWiki 1.5)

BUG: Add boilerplate text... ;)


Tables:

create table if not exists sl_links (
	sll_from varchar(255),
	sll_to varchar(255),
	sll_sec varchar(255)
);

create table if not exists sl_targets (
	slt_title varchar(255),
	slt_sec varchar(255),
	index(slt_title, slt_sec)
);

<?php //config.php

$config['mysql_host'] = '127.0.0.1';
$config['mysql_user'] = 'root';
$config['mysql_pass'] = '';
$config['mysql_db'] = 'wikipedia';

?>

<?php //functions.php

// database

function db_connect($new = false) {
	global $config;
	$db = mysql_connect($config['mysql_host'], $config['mysql_user'], $config['mysql_pass'], $new);
	mysql_select_db($config['mysql_db'], $db);
	return $db;
}

// functions from MediaWiki 1.5

function MW_decodeCharReferences( $text ) {
	return preg_replace_callback(
		'/&([A-Za-z0-9]+);|&\#([0-9]+);|&\#x([0-9A-Za-z]+);|&\#X([0-9A-Za-z]+);|(&)/x',
		'MW_decodeCharReferencesCallback',
		$text );
}

function MW_decodeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		return MW_decodeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		return  MW_decodeChar( intval( $matches[2] ) );
	} elseif( $matches[3] != ''  ) {
		return  MW_decodeChar( hexdec( $matches[3] ) );
	} elseif( $matches[4] != '' ) {
		return  MW_decodeChar( hexdec( $matches[4] ) );
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}

function MW_decodeEntity( $name ) {
	$wgHtmlEntities = array(
	'Aacute'   => 193,
	'aacute'   => 225,
	'Acirc'    => 194,
	'acirc'    => 226,
	'acute'    => 180,
	'AElig'    => 198,
	'aelig'    => 230,
	'Agrave'   => 192,
	'agrave'   => 224,
	'alefsym'  => 8501,
	'Alpha'    => 913,
	'alpha'    => 945,
	'amp'      => 38,
	'and'      => 8743,
	'ang'      => 8736,
	'Aring'    => 197,
	'aring'    => 229,
	'asymp'    => 8776,
	'Atilde'   => 195,
	'atilde'   => 227,
	'Auml'     => 196,
	'auml'     => 228,
	'bdquo'    => 8222,
	'Beta'     => 914,
	'beta'     => 946,
	'brvbar'   => 166,
	'bull'     => 8226,
	'cap'      => 8745,
	'Ccedil'   => 199,
	'ccedil'   => 231,
	'cedil'    => 184,
	'cent'     => 162,
	'Chi'      => 935,
	'chi'      => 967,
	'circ'     => 710,
	'clubs'    => 9827,
	'cong'     => 8773,
	'copy'     => 169,
	'crarr'    => 8629,
	'cup'      => 8746,
	'curren'   => 164,
	'dagger'   => 8224,
	'Dagger'   => 8225,
	'darr'     => 8595,
	'dArr'     => 8659,
	'deg'      => 176,
	'Delta'    => 916,
	'delta'    => 948,
	'diams'    => 9830,
	'divide'   => 247,
	'Eacute'   => 201,
	'eacute'   => 233,
	'Ecirc'    => 202,
	'ecirc'    => 234,
	'Egrave'   => 200,
	'egrave'   => 232,
	'empty'    => 8709,
	'emsp'     => 8195,
	'ensp'     => 8194,
	'Epsilon'  => 917,
	'epsilon'  => 949,
	'equiv'    => 8801,
	'Eta'      => 919,
	'eta'      => 951,
	'ETH'      => 208,
	'eth'      => 240,
	'Euml'     => 203,
	'euml'     => 235,
	'euro'     => 8364,
	'exist'    => 8707,
	'fnof'     => 402,
	'forall'   => 8704,
	'frac12'   => 189,
	'frac14'   => 188,
	'frac34'   => 190,
	'frasl'    => 8260,
	'Gamma'    => 915,
	'gamma'    => 947,
	'ge'       => 8805,
	'gt'       => 62,
	'harr'     => 8596,
	'hArr'     => 8660,
	'hearts'   => 9829,
	'hellip'   => 8230,
	'Iacute'   => 205,
	'iacute'   => 237,
	'Icirc'    => 206,
	'icirc'    => 238,
	'iexcl'    => 161,
	'Igrave'   => 204,
	'igrave'   => 236,
	'image'    => 8465,
	'infin'    => 8734,
	'int'      => 8747,
	'Iota'     => 921,
	'iota'     => 953,
	'iquest'   => 191,
	'isin'     => 8712,
	'Iuml'     => 207,
	'iuml'     => 239,
	'Kappa'    => 922,
	'kappa'    => 954,
	'Lambda'   => 923,
	'lambda'   => 955,
	'lang'     => 9001,
	'laquo'    => 171,
	'larr'     => 8592,
	'lArr'     => 8656,
	'lceil'    => 8968,
	'ldquo'    => 8220,
	'le'       => 8804,
	'lfloor'   => 8970,
	'lowast'   => 8727,
	'loz'      => 9674,
	'lrm'      => 8206,
	'lsaquo'   => 8249,
	'lsquo'    => 8216,
	'lt'       => 60,
	'macr'     => 175,
	'mdash'    => 8212,
	'micro'    => 181,
	'middot'   => 183,
	'minus'    => 8722,
	'Mu'       => 924,
	'mu'       => 956,
	'nabla'    => 8711,
	'nbsp'     => 160,
	'ndash'    => 8211,
	'ne'       => 8800,
	'ni'       => 8715,
	'not'      => 172,
	'notin'    => 8713,
	'nsub'     => 8836,
	'Ntilde'   => 209,
	'ntilde'   => 241,
	'Nu'       => 925,
	'nu'       => 957,
	'Oacute'   => 211,
	'oacute'   => 243,
	'Ocirc'    => 212,
	'ocirc'    => 244,
	'OElig'    => 338,
	'oelig'    => 339,
	'Ograve'   => 210,
	'ograve'   => 242,
	'oline'    => 8254,
	'Omega'    => 937,
	'omega'    => 969,
	'Omicron'  => 927,
	'omicron'  => 959,
	'oplus'    => 8853,
	'or'       => 8744,
	'ordf'     => 170,
	'ordm'     => 186,
	'Oslash'   => 216,
	'oslash'   => 248,
	'Otilde'   => 213,
	'otilde'   => 245,
	'otimes'   => 8855,
	'Ouml'     => 214,
	'ouml'     => 246,
	'para'     => 182,
	'part'     => 8706,
	'permil'   => 8240,
	'perp'     => 8869,
	'Phi'      => 934,
	'phi'      => 966,
	'Pi'       => 928,
	'pi'       => 960,
	'piv'      => 982,
	'plusmn'   => 177,
	'pound'    => 163,
	'prime'    => 8242,
	'Prime'    => 8243,
	'prod'     => 8719,
	'prop'     => 8733,
	'Psi'      => 936,
	'psi'      => 968,
	'quot'     => 34,
	'radic'    => 8730,
	'rang'     => 9002,
	'raquo'    => 187,
	'rarr'     => 8594,
	'rArr'     => 8658,
	'rceil'    => 8969,
	'rdquo'    => 8221,
	'real'     => 8476,
	'reg'      => 174,
	'rfloor'   => 8971,
	'Rho'      => 929,
	'rho'      => 961,
	'rlm'      => 8207,
	'rsaquo'   => 8250,
	'rsquo'    => 8217,
	'sbquo'    => 8218,
	'Scaron'   => 352,
	'scaron'   => 353,
	'sdot'     => 8901,
	'sect'     => 167,
	'shy'      => 173,
	'Sigma'    => 931,
	'sigma'    => 963,
	'sigmaf'   => 962,
	'sim'      => 8764,
	'spades'   => 9824,
	'sub'      => 8834,
	'sube'     => 8838,
	'sum'      => 8721,
	'sup'      => 8835,
	'sup1'     => 185,
	'sup2'     => 178,
	'sup3'     => 179,
	'supe'     => 8839,
	'szlig'    => 223,
	'Tau'      => 932,
	'tau'      => 964,
	'there4'   => 8756,
	'Theta'    => 920,
	'theta'    => 952,
	'thetasym' => 977,
	'thinsp'   => 8201,
	'THORN'    => 222,
	'thorn'    => 254,
	'tilde'    => 732,
	'times'    => 215,
	'trade'    => 8482,
	'Uacute'   => 218,
	'uacute'   => 250,
	'uarr'     => 8593,
	'uArr'     => 8657,
	'Ucirc'    => 219,
	'ucirc'    => 251,
	'Ugrave'   => 217,
	'ugrave'   => 249,
	'uml'      => 168,
	'upsih'    => 978,
	'Upsilon'  => 933,
	'upsilon'  => 965,
	'Uuml'     => 220,
	'uuml'     => 252,
	'weierp'   => 8472,
	'Xi'       => 926,
	'xi'       => 958,
	'Yacute'   => 221,
	'yacute'   => 253,
	'yen'      => 165,
	'Yuml'     => 376,
	'yuml'     => 255,
	'Zeta'     => 918,
	'zeta'     => 950,
	'zwj'      => 8205,
	'zwnj'     => 8204 );
	if( isset( $wgHtmlEntities[$name] ) ) {
		return MW_codepointToUtf8( $wgHtmlEntities[$name] );
	} else {
		return "&$name;";
	}
}

function MW_decodeChar( $codepoint ) {
	if( MW_validateCodepoint( $codepoint ) ) {
		return MW_codepointToUtf8( $codepoint );
	} else {
		return ''; # ok forget about UTF8_REPLACEMENT; for now
	}
}
	
function MW_validateCodepoint( $codepoint ) {
	return ($codepoint ==    0x09)
		|| ($codepoint ==    0x0a)
		|| ($codepoint ==    0x0d)
		|| ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
		|| ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
		|| ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
}
	
function MW_codepointToUtf8( $codepoint ) {
	if($codepoint <		0x80) return chr($codepoint);
	if($codepoint <    0x800) return chr($codepoint >>	6 & 0x3f | 0xc0) .
									 chr($codepoint		  & 0x3f | 0x80);
	if($codepoint <  0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
									 chr($codepoint >>	6 & 0x3f | 0x80) .
									 chr($codepoint		  & 0x3f | 0x80);
	if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
									 chr($codepoint >> 12 & 0x3f | 0x80) .
									 chr($codepoint >>	6 & 0x3f | 0x80) .
									 chr($codepoint		  & 0x3f | 0x80);

	die("Asked for code outside of range ($codepoint)\n");
}

// end functions from MediaWiki

?>

<?php

//Warning: truncate the sl_* tables first

//BUG: will output mysql error if query has no values
//BUG: urlencode()d links should be decoded
//BUG: nedd to take multiple equal section titles into account
//NOTE: Redirs are excluded in theory, but some are not marked as such in dump
//      (might be useful to take them into account as well,
//        so they will be correct if they will be supported in future)

//We are not taking into account:
//links in templates
//sections in templates
//templates in section titles

//Other problems:
//Markup in section titles, in particular math
//(simple markup like links, italics etc. is ok)

require_once('config.php');
require_once('functions.php');

$dbr = db_connect();
$dbw = db_connect(true);

// for now, we are just concerned with main namespace,
// so we will ignore all links with namespace or interwiki prefix
// prefixes are NOT case sensitive
$prefixes = array(
	'media',
	'speciale',
	'special',
	'discussione',
	'talk',
	'utente',
	'user',
	'discussioni_utente',
	'user_talk',
	'wikipedia',
	'discussioni_wikipedia',
	'wikipedia_talk',
	'immagine',
	'image',
	'discussioni_immagine',
	'image_talk',
	'mediawiki',
	'discussioni_mediawiki',
	'mediawiki_talk',
	'template',
	'discussioni_template',
	'template_talk',
	'aiuto',
	'help',
	'discussioni_aiuto',
	'help_talk',
	'categoria',
	'category',
	'discussioni_categoria',
	'category_talk',
	'portale',
	'portal',
	'discussioni_portale',
	'portal_talk');
$query = 'select iw_prefix from interwiki';
$result = mysql_query ($query, $dbr);
while ( $row = mysql_fetch_array($result) ) {
	$prefixes[] = $row['iw_prefix'];
}
echo count($prefixes), " prefixes found.\n";

echo "Retrieving pages from db...\n";
$query = 'select page_title, old_text from page inner join text on page_latest = old_id where page_namespace = 0 and page_is_redirect = 0';
$result = mysql_unbuffered_query($query, $dbr);
$s_l = ' ';
$s_t = ' ';
$query_l = "insert into sl_links (sll_from, sll_to, sll_sec) values";
$query_t = "insert into sl_targets (slt_title, slt_sec) values";
$counter = 0;
echo "Starting section title/anchor and section link extraction...\n";
while ( $row = mysql_fetch_assoc($result) ) {
	$title = mysql_escape_string($row['page_title']);
	// remove html comments (what about nested comments?)
	// decode references now to avoid trouble with &#num; entities
	$tmp = explode('<!--', MW_decodeCharReferences($row['old_text']));
	$text = $tmp[0];
	unset($tmp[0]);
	foreach ( $tmp as $tmp2 ) {
		list($dummy, $tmp3) = explode('-->', $tmp2, 2);
		$text .= $tmp3;
	}
	// sections (may occur at very beginning of page!)
	preg_match_all('/(^|\n)={2,6}([^=\n]([^\n]*[^=\n])?)={2,6}/u', $text, $matches, PREG_SET_ORDER);
	foreach ( $matches as $match ) {
		// do NOT collapse whitespace (mimick behaviour of MediaWiki 1.6alpha)
		$sec = strtr(trim($match[2]), ' ', '_');
		// quick and dirty cleanup of most frequent wiki-markup etc.
		// will produce incorrect results with math, templates, and some other cases
		$sec = preg_replace("/'{2,3}/u", '', $sec); // italics, bold (might kill some apostrophe)
		$sec = preg_replace('/\[\[[^]|\n]+\|([^]\n]+)\]\]/u', '$1', $sec); // wikilink w/ text
		$sec = preg_replace('/\[\[|\]\]/u', '', $sec); // any further double square brackets (even non-matched)
		$sec = preg_replace('|<[^>\n]+>|u', '', $sec); // anything that vaguely resembles an HTML tag
		// this should be more or less the anchor name:
		// (note that we cannot recover original text from this)
		$sec = strtr(urlencode($sec), '%', '.');
		$query_t .= $s_t . "('$title', '$sec')";
		$s_t = ', ';
	}
	// anchors ('id' attributes of html elements - 'name' seems not accepted as of MW1.6alpha)
	// (should probably be restricted to allowed elements)
	// (maybe should check for uniqueness?)
	// very rough:
	preg_match_all('/<[a-z]+[^<>\n]*[\s^\n]+id[\s^\n]*=[\s^\n]*([^<>\n]*)>/u', $text, $matches, PREG_SET_ORDER);
	foreach ( $matches as $match ) {
		$tmp = $match[1];
		$anc = '';
		if ( strpos($tmp, '"') === 0 ) {
			$tmp = mb_substr($tmp, 1);
			if ( strpos($tmp, '"') ) {
				list($anc, $dummy) = explode('"', $tmp, 2);
			}
		} elseif ( strpos($tmp, "'") === 0 ) {
			$tmp = mb_substr($tmp, 1);
			if ( strpos($tmp, "'") ) {
				list($anc, $dummy) = explode("'", $tmp, 2);
			}
		} else {
			preg_match('/^[A-Za-z0-9]+/u', $tmp, $a_match);
			$anc = $a_match[0];
		}
		// do NOT collapse NOR trim whitespace (mimick behaviour of MediaWiki 1.6alpha)
		$anc = strtr($anc, ' ', '_');
		// this should be more or less the anchor name:
		// (note that we cannot recover original text from this)
		$anc = strtr(urlencode($anc), '%', '.');
		if ( $anc ) {
			$query_t .= $s_t . "('$title', '$anc')";
			$s_t = ', ';
		}
	}
	// links to sections (note that prefix must NOT contain any ':')
	preg_match_all('/\[\[:?(([^]|\n:]+):)?([^]|\n]*)#([^]|\n]+)(\||\]\])/u', $text, $matches, PREG_SET_ORDER);
	foreach ( $matches as $match ) {
		// we are concerned only with internal links to main namespace:
		$l_prefix = strtolower(preg_replace('/[\s_]+/u', '_', trim($match[2])));
		if ( in_array($l_prefix, $prefixes) ) {
			continue;
		}
		// not a valid prefix, so add it back to title:
		$tmp = preg_replace('/[\s_]+/u', '_', trim($match[1] . $match[3])); // collapse whitespace and _
		$l_title = mysql_escape_string(mb_strtoupper(mb_substr($tmp, 0, 1)) . mb_substr($tmp, 1));
		if ( $l_title == '' ) {
			$l_title = $title;
		}
		$l_sec = preg_replace('/[\s_]+/u', '_', trim($match[4])); // collapse whitespace and _
		// this is just a guess, MediaWiki source should be analyzed to confirm/correct
		if ( $l_sec != urldecode($l_sec) ) {
			$l_sec = strtr($l_sec, '%', '.');
		} else {
			$l_sec = strtr(urlencode($l_sec), '%', '.');
		}
		$query_l .= $s_l . "('$title', '$l_title', '$l_sec')";
		$s_l = ', ';
	}
	$counter++;
	if ( ($counter%1000) == 0 ) {
		echo $counter, "\n";
		mysql_query($query_l, $dbw);
		if ( mysql_errno($dbw) ) {
			echo mysql_error($dbw), "\n";
		}
		mysql_query($query_t, $dbw);
		if ( mysql_errno($dbw) ) {
			echo mysql_error($dbw), "\n";
		}
		$s_l = ' ';
		$s_t = ' ';
		$query_l = "insert into sl_links (sll_from, sll_to, sll_sec) values";
		$query_t = "insert into sl_targets (slt_title, slt_sec) values";
	}
}

// leftover query (not an elegant solution...)
mysql_query($query_l, $dbw);
if ( mysql_errno($dbw) ) {
	echo mysql_error($dbw), "\n";
}
mysql_query($query_t, $dbw);
if ( mysql_errno($dbw) ) {
	echo mysql_error($dbw), "\n";
}

// and now for the redirects
echo "Loading redirects from db...\n";
$query = 'select page_title, old_text from page inner join text on page_latest = old_id where page_namespace = 0 and page_is_redirect = 1';
$result = mysql_unbuffered_query($query, $dbr);
$counter = 0;
echo "Creating redirect targets...\n";
while ( $row = mysql_fetch_assoc($result) ) {
	$title = mysql_escape_string($row['page_title']);
	$text = MW_decodeCharReferences($row['old_text']);
	// we use essentially the same regexp as above, 
	//since prefixes and anchors etc. might exist here as well 
	//(but anchor is optional (and meaningless as of MW1.6alpha))
	preg_match('/#\s*redirect\s*\[\[:?(([^]|\n:]+):)?([^]|\n]*)(#([^]|\n]+))?(\||\]\])/ui', $text, $match);
	// we are concerned only with internal links to main namespace:
	$l_prefix = strtolower(preg_replace('/[\s_]+/u', '_', trim($match[2])));
	if ( in_array($l_prefix, $prefixes) ) {
		continue;
	}
	// not a valid prefix, so add it back to title:
	$tmp = preg_replace('/[\s_]+/u', '_', trim($match[1] . $match[3])); // collapse whitespace and _
	$l_title = mysql_escape_string(mb_strtoupper(mb_substr($tmp, 0, 1)) . mb_substr($tmp, 1));
	$query = "insert into sl_targets (slt_title, slt_sec) select '$title', slt_sec from sl_targets where slt_title = '$l_title'";
	mysql_query($query, $dbw);
	if ( mysql_errno($dbw) ) {
		echo mysql_error($dbw), "\n";
	}
	$counter++;
	if ( ($counter%1000) == 0 ) {
		echo $counter, "\n";
	}
}
?>

Queries:

General list:
	Page-containing-link: link (number of occurrences in page)
select concat('#[[', sll_from, ']]: [[', sll_to, '#', sll_sec, ']] (', count(*), ')') from sl_links left join sl_targets on sll_to = slt_title and sll_sec = slt_sec where slt_sec is null group by sll_from, sll_to, sll_sec;

Star Wars planets (and analogous cases):
select concat('#[[Pianeti_di_Guerre_Stellari#', sll_sec, ']]: ', group_concat(concat('[[', sll_from, ']]') separator ', ')) from sl_links left join sl_targets on sll_to = slt_title and sll_sec = slt_sec where sll_to = 'Pianeti_di_Guerre_Stellari' and slt_sec is null group by sll_sec;