Utente:Leonard Vertighel/temp
Aspetto
Unfinished anchor link check script.
Released under GPL.
(Contains code from MediaWiki 1.5)
BUG: Add boilerplate text... ;)
Tables:
create table if not exists sl_links (
sll_from varchar(255),
sll_to varchar(255),
sll_sec varchar(255)
);
create table if not exists sl_targets (
slt_title varchar(255),
slt_sec varchar(255),
index(slt_title, slt_sec)
);
<?php //config.php
$config['mysql_host'] = '127.0.0.1';
$config['mysql_user'] = 'root';
$config['mysql_pass'] = '';
$config['mysql_db'] = 'wikipedia';
?>
<?php //functions.php
// database
function db_connect($new = false) {
global $config;
$db = mysql_connect($config['mysql_host'], $config['mysql_user'], $config['mysql_pass'], $new);
mysql_select_db($config['mysql_db'], $db);
return $db;
}
// functions from MediaWiki 1.5
function MW_decodeCharReferences( $text ) {
return preg_replace_callback(
'/&([A-Za-z0-9]+);|&\#([0-9]+);|&\#x([0-9A-Za-z]+);|&\#X([0-9A-Za-z]+);|(&)/x',
'MW_decodeCharReferencesCallback',
$text );
}
function MW_decodeCharReferencesCallback( $matches ) {
if( $matches[1] != '' ) {
return MW_decodeEntity( $matches[1] );
} elseif( $matches[2] != '' ) {
return MW_decodeChar( intval( $matches[2] ) );
} elseif( $matches[3] != '' ) {
return MW_decodeChar( hexdec( $matches[3] ) );
} elseif( $matches[4] != '' ) {
return MW_decodeChar( hexdec( $matches[4] ) );
}
# Last case should be an ampersand by itself
return $matches[0];
}
function MW_decodeEntity( $name ) {
$wgHtmlEntities = array(
'Aacute' => 193,
'aacute' => 225,
'Acirc' => 194,
'acirc' => 226,
'acute' => 180,
'AElig' => 198,
'aelig' => 230,
'Agrave' => 192,
'agrave' => 224,
'alefsym' => 8501,
'Alpha' => 913,
'alpha' => 945,
'amp' => 38,
'and' => 8743,
'ang' => 8736,
'Aring' => 197,
'aring' => 229,
'asymp' => 8776,
'Atilde' => 195,
'atilde' => 227,
'Auml' => 196,
'auml' => 228,
'bdquo' => 8222,
'Beta' => 914,
'beta' => 946,
'brvbar' => 166,
'bull' => 8226,
'cap' => 8745,
'Ccedil' => 199,
'ccedil' => 231,
'cedil' => 184,
'cent' => 162,
'Chi' => 935,
'chi' => 967,
'circ' => 710,
'clubs' => 9827,
'cong' => 8773,
'copy' => 169,
'crarr' => 8629,
'cup' => 8746,
'curren' => 164,
'dagger' => 8224,
'Dagger' => 8225,
'darr' => 8595,
'dArr' => 8659,
'deg' => 176,
'Delta' => 916,
'delta' => 948,
'diams' => 9830,
'divide' => 247,
'Eacute' => 201,
'eacute' => 233,
'Ecirc' => 202,
'ecirc' => 234,
'Egrave' => 200,
'egrave' => 232,
'empty' => 8709,
'emsp' => 8195,
'ensp' => 8194,
'Epsilon' => 917,
'epsilon' => 949,
'equiv' => 8801,
'Eta' => 919,
'eta' => 951,
'ETH' => 208,
'eth' => 240,
'Euml' => 203,
'euml' => 235,
'euro' => 8364,
'exist' => 8707,
'fnof' => 402,
'forall' => 8704,
'frac12' => 189,
'frac14' => 188,
'frac34' => 190,
'frasl' => 8260,
'Gamma' => 915,
'gamma' => 947,
'ge' => 8805,
'gt' => 62,
'harr' => 8596,
'hArr' => 8660,
'hearts' => 9829,
'hellip' => 8230,
'Iacute' => 205,
'iacute' => 237,
'Icirc' => 206,
'icirc' => 238,
'iexcl' => 161,
'Igrave' => 204,
'igrave' => 236,
'image' => 8465,
'infin' => 8734,
'int' => 8747,
'Iota' => 921,
'iota' => 953,
'iquest' => 191,
'isin' => 8712,
'Iuml' => 207,
'iuml' => 239,
'Kappa' => 922,
'kappa' => 954,
'Lambda' => 923,
'lambda' => 955,
'lang' => 9001,
'laquo' => 171,
'larr' => 8592,
'lArr' => 8656,
'lceil' => 8968,
'ldquo' => 8220,
'le' => 8804,
'lfloor' => 8970,
'lowast' => 8727,
'loz' => 9674,
'lrm' => 8206,
'lsaquo' => 8249,
'lsquo' => 8216,
'lt' => 60,
'macr' => 175,
'mdash' => 8212,
'micro' => 181,
'middot' => 183,
'minus' => 8722,
'Mu' => 924,
'mu' => 956,
'nabla' => 8711,
'nbsp' => 160,
'ndash' => 8211,
'ne' => 8800,
'ni' => 8715,
'not' => 172,
'notin' => 8713,
'nsub' => 8836,
'Ntilde' => 209,
'ntilde' => 241,
'Nu' => 925,
'nu' => 957,
'Oacute' => 211,
'oacute' => 243,
'Ocirc' => 212,
'ocirc' => 244,
'OElig' => 338,
'oelig' => 339,
'Ograve' => 210,
'ograve' => 242,
'oline' => 8254,
'Omega' => 937,
'omega' => 969,
'Omicron' => 927,
'omicron' => 959,
'oplus' => 8853,
'or' => 8744,
'ordf' => 170,
'ordm' => 186,
'Oslash' => 216,
'oslash' => 248,
'Otilde' => 213,
'otilde' => 245,
'otimes' => 8855,
'Ouml' => 214,
'ouml' => 246,
'para' => 182,
'part' => 8706,
'permil' => 8240,
'perp' => 8869,
'Phi' => 934,
'phi' => 966,
'Pi' => 928,
'pi' => 960,
'piv' => 982,
'plusmn' => 177,
'pound' => 163,
'prime' => 8242,
'Prime' => 8243,
'prod' => 8719,
'prop' => 8733,
'Psi' => 936,
'psi' => 968,
'quot' => 34,
'radic' => 8730,
'rang' => 9002,
'raquo' => 187,
'rarr' => 8594,
'rArr' => 8658,
'rceil' => 8969,
'rdquo' => 8221,
'real' => 8476,
'reg' => 174,
'rfloor' => 8971,
'Rho' => 929,
'rho' => 961,
'rlm' => 8207,
'rsaquo' => 8250,
'rsquo' => 8217,
'sbquo' => 8218,
'Scaron' => 352,
'scaron' => 353,
'sdot' => 8901,
'sect' => 167,
'shy' => 173,
'Sigma' => 931,
'sigma' => 963,
'sigmaf' => 962,
'sim' => 8764,
'spades' => 9824,
'sub' => 8834,
'sube' => 8838,
'sum' => 8721,
'sup' => 8835,
'sup1' => 185,
'sup2' => 178,
'sup3' => 179,
'supe' => 8839,
'szlig' => 223,
'Tau' => 932,
'tau' => 964,
'there4' => 8756,
'Theta' => 920,
'theta' => 952,
'thetasym' => 977,
'thinsp' => 8201,
'THORN' => 222,
'thorn' => 254,
'tilde' => 732,
'times' => 215,
'trade' => 8482,
'Uacute' => 218,
'uacute' => 250,
'uarr' => 8593,
'uArr' => 8657,
'Ucirc' => 219,
'ucirc' => 251,
'Ugrave' => 217,
'ugrave' => 249,
'uml' => 168,
'upsih' => 978,
'Upsilon' => 933,
'upsilon' => 965,
'Uuml' => 220,
'uuml' => 252,
'weierp' => 8472,
'Xi' => 926,
'xi' => 958,
'Yacute' => 221,
'yacute' => 253,
'yen' => 165,
'Yuml' => 376,
'yuml' => 255,
'Zeta' => 918,
'zeta' => 950,
'zwj' => 8205,
'zwnj' => 8204 );
if( isset( $wgHtmlEntities[$name] ) ) {
return MW_codepointToUtf8( $wgHtmlEntities[$name] );
} else {
return "&$name;";
}
}
function MW_decodeChar( $codepoint ) {
if( MW_validateCodepoint( $codepoint ) ) {
return MW_codepointToUtf8( $codepoint );
} else {
return ''; # ok forget about UTF8_REPLACEMENT; for now
}
}
function MW_validateCodepoint( $codepoint ) {
return ($codepoint == 0x09)
|| ($codepoint == 0x0a)
|| ($codepoint == 0x0d)
|| ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
|| ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
|| ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
}
function MW_codepointToUtf8( $codepoint ) {
if($codepoint < 0x80) return chr($codepoint);
if($codepoint < 0x800) return chr($codepoint >> 6 & 0x3f | 0xc0) .
chr($codepoint & 0x3f | 0x80);
if($codepoint < 0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
chr($codepoint >> 6 & 0x3f | 0x80) .
chr($codepoint & 0x3f | 0x80);
if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
chr($codepoint >> 12 & 0x3f | 0x80) .
chr($codepoint >> 6 & 0x3f | 0x80) .
chr($codepoint & 0x3f | 0x80);
die("Asked for code outside of range ($codepoint)\n");
}
// end functions from MediaWiki
?>
<?php
//Warning: truncate the sl_* tables first
//BUG: will output mysql error if query has no values
//BUG: urlencode()d links should be decoded
//BUG: nedd to take multiple equal section titles into account
//NOTE: Redirs are excluded in theory, but some are not marked as such in dump
// (might be useful to take them into account as well,
// so they will be correct if they will be supported in future)
//We are not taking into account:
//links in templates
//sections in templates
//templates in section titles
//Other problems:
//Markup in section titles, in particular math
//(simple markup like links, italics etc. is ok)
require_once('config.php');
require_once('functions.php');
$dbr = db_connect();
$dbw = db_connect(true);
// for now, we are just concerned with main namespace,
// so we will ignore all links with namespace or interwiki prefix
// prefixes are NOT case sensitive
$prefixes = array(
'media',
'speciale',
'special',
'discussione',
'talk',
'utente',
'user',
'discussioni_utente',
'user_talk',
'wikipedia',
'discussioni_wikipedia',
'wikipedia_talk',
'immagine',
'image',
'discussioni_immagine',
'image_talk',
'mediawiki',
'discussioni_mediawiki',
'mediawiki_talk',
'template',
'discussioni_template',
'template_talk',
'aiuto',
'help',
'discussioni_aiuto',
'help_talk',
'categoria',
'category',
'discussioni_categoria',
'category_talk',
'portale',
'portal',
'discussioni_portale',
'portal_talk');
$query = 'select iw_prefix from interwiki';
$result = mysql_query ($query, $dbr);
while ( $row = mysql_fetch_array($result) ) {
$prefixes[] = $row['iw_prefix'];
}
echo count($prefixes), " prefixes found.\n";
echo "Retrieving pages from db...\n";
$query = 'select page_title, old_text from page inner join text on page_latest = old_id where page_namespace = 0 and page_is_redirect = 0';
$result = mysql_unbuffered_query($query, $dbr);
$s_l = ' ';
$s_t = ' ';
$query_l = "insert into sl_links (sll_from, sll_to, sll_sec) values";
$query_t = "insert into sl_targets (slt_title, slt_sec) values";
$counter = 0;
echo "Starting section title/anchor and section link extraction...\n";
while ( $row = mysql_fetch_assoc($result) ) {
$title = mysql_escape_string($row['page_title']);
// remove html comments (what about nested comments?)
// decode references now to avoid trouble with &#num; entities
$tmp = explode('<!--', MW_decodeCharReferences($row['old_text']));
$text = $tmp[0];
unset($tmp[0]);
foreach ( $tmp as $tmp2 ) {
list($dummy, $tmp3) = explode('-->', $tmp2, 2);
$text .= $tmp3;
}
// sections (may occur at very beginning of page!)
preg_match_all('/(^|\n)={2,6}([^=\n]([^\n]*[^=\n])?)={2,6}/u', $text, $matches, PREG_SET_ORDER);
foreach ( $matches as $match ) {
// do NOT collapse whitespace (mimick behaviour of MediaWiki 1.6alpha)
$sec = strtr(trim($match[2]), ' ', '_');
// quick and dirty cleanup of most frequent wiki-markup etc.
// will produce incorrect results with math, templates, and some other cases
$sec = preg_replace("/'{2,3}/u", '', $sec); // italics, bold (might kill some apostrophe)
$sec = preg_replace('/\[\[[^]|\n]+\|([^]\n]+)\]\]/u', '$1', $sec); // wikilink w/ text
$sec = preg_replace('/\[\[|\]\]/u', '', $sec); // any further double square brackets (even non-matched)
$sec = preg_replace('|<[^>\n]+>|u', '', $sec); // anything that vaguely resembles an HTML tag
// this should be more or less the anchor name:
// (note that we cannot recover original text from this)
$sec = strtr(urlencode($sec), '%', '.');
$query_t .= $s_t . "('$title', '$sec')";
$s_t = ', ';
}
// anchors ('id' attributes of html elements - 'name' seems not accepted as of MW1.6alpha)
// (should probably be restricted to allowed elements)
// (maybe should check for uniqueness?)
// very rough:
preg_match_all('/<[a-z]+[^<>\n]*[\s^\n]+id[\s^\n]*=[\s^\n]*([^<>\n]*)>/u', $text, $matches, PREG_SET_ORDER);
foreach ( $matches as $match ) {
$tmp = $match[1];
$anc = '';
if ( strpos($tmp, '"') === 0 ) {
$tmp = mb_substr($tmp, 1);
if ( strpos($tmp, '"') ) {
list($anc, $dummy) = explode('"', $tmp, 2);
}
} elseif ( strpos($tmp, "'") === 0 ) {
$tmp = mb_substr($tmp, 1);
if ( strpos($tmp, "'") ) {
list($anc, $dummy) = explode("'", $tmp, 2);
}
} else {
preg_match('/^[A-Za-z0-9]+/u', $tmp, $a_match);
$anc = $a_match[0];
}
// do NOT collapse NOR trim whitespace (mimick behaviour of MediaWiki 1.6alpha)
$anc = strtr($anc, ' ', '_');
// this should be more or less the anchor name:
// (note that we cannot recover original text from this)
$anc = strtr(urlencode($anc), '%', '.');
if ( $anc ) {
$query_t .= $s_t . "('$title', '$anc')";
$s_t = ', ';
}
}
// links to sections (note that prefix must NOT contain any ':')
preg_match_all('/\[\[:?(([^]|\n:]+):)?([^]|\n]*)#([^]|\n]+)(\||\]\])/u', $text, $matches, PREG_SET_ORDER);
foreach ( $matches as $match ) {
// we are concerned only with internal links to main namespace:
$l_prefix = strtolower(preg_replace('/[\s_]+/u', '_', trim($match[2])));
if ( in_array($l_prefix, $prefixes) ) {
continue;
}
// not a valid prefix, so add it back to title:
$tmp = preg_replace('/[\s_]+/u', '_', trim($match[1] . $match[3])); // collapse whitespace and _
$l_title = mysql_escape_string(mb_strtoupper(mb_substr($tmp, 0, 1)) . mb_substr($tmp, 1));
if ( $l_title == '' ) {
$l_title = $title;
}
$l_sec = preg_replace('/[\s_]+/u', '_', trim($match[4])); // collapse whitespace and _
// this is just a guess, MediaWiki source should be analyzed to confirm/correct
if ( $l_sec != urldecode($l_sec) ) {
$l_sec = strtr($l_sec, '%', '.');
} else {
$l_sec = strtr(urlencode($l_sec), '%', '.');
}
$query_l .= $s_l . "('$title', '$l_title', '$l_sec')";
$s_l = ', ';
}
$counter++;
if ( ($counter%1000) == 0 ) {
echo $counter, "\n";
mysql_query($query_l, $dbw);
if ( mysql_errno($dbw) ) {
echo mysql_error($dbw), "\n";
}
mysql_query($query_t, $dbw);
if ( mysql_errno($dbw) ) {
echo mysql_error($dbw), "\n";
}
$s_l = ' ';
$s_t = ' ';
$query_l = "insert into sl_links (sll_from, sll_to, sll_sec) values";
$query_t = "insert into sl_targets (slt_title, slt_sec) values";
}
}
// leftover query (not an elegant solution...)
mysql_query($query_l, $dbw);
if ( mysql_errno($dbw) ) {
echo mysql_error($dbw), "\n";
}
mysql_query($query_t, $dbw);
if ( mysql_errno($dbw) ) {
echo mysql_error($dbw), "\n";
}
// and now for the redirects
echo "Loading redirects from db...\n";
$query = 'select page_title, old_text from page inner join text on page_latest = old_id where page_namespace = 0 and page_is_redirect = 1';
$result = mysql_unbuffered_query($query, $dbr);
$counter = 0;
echo "Creating redirect targets...\n";
while ( $row = mysql_fetch_assoc($result) ) {
$title = mysql_escape_string($row['page_title']);
$text = MW_decodeCharReferences($row['old_text']);
// we use essentially the same regexp as above,
//since prefixes and anchors etc. might exist here as well
//(but anchor is optional (and meaningless as of MW1.6alpha))
preg_match('/#\s*redirect\s*\[\[:?(([^]|\n:]+):)?([^]|\n]*)(#([^]|\n]+))?(\||\]\])/ui', $text, $match);
// we are concerned only with internal links to main namespace:
$l_prefix = strtolower(preg_replace('/[\s_]+/u', '_', trim($match[2])));
if ( in_array($l_prefix, $prefixes) ) {
continue;
}
// not a valid prefix, so add it back to title:
$tmp = preg_replace('/[\s_]+/u', '_', trim($match[1] . $match[3])); // collapse whitespace and _
$l_title = mysql_escape_string(mb_strtoupper(mb_substr($tmp, 0, 1)) . mb_substr($tmp, 1));
$query = "insert into sl_targets (slt_title, slt_sec) select '$title', slt_sec from sl_targets where slt_title = '$l_title'";
mysql_query($query, $dbw);
if ( mysql_errno($dbw) ) {
echo mysql_error($dbw), "\n";
}
$counter++;
if ( ($counter%1000) == 0 ) {
echo $counter, "\n";
}
}
?>
Queries:
General list:
Page-containing-link: link (number of occurrences in page)
select concat('#[[', sll_from, ']]: [[', sll_to, '#', sll_sec, ']] (', count(*), ')') from sl_links left join sl_targets on sll_to = slt_title and sll_sec = slt_sec where slt_sec is null group by sll_from, sll_to, sll_sec;
Star Wars planets (and analogous cases):
select concat('#[[Pianeti_di_Guerre_Stellari#', sll_sec, ']]: ', group_concat(concat('[[', sll_from, ']]') separator ', ')) from sl_links left join sl_targets on sll_to = slt_title and sll_sec = slt_sec where sll_to = 'Pianeti_di_Guerre_Stellari' and slt_sec is null group by sll_sec;