Utente:MaEr/Gerarchia delle sezioni

Da Wikipedia, l'enciclopedia libera.
Vai alla navigazione Vai alla ricerca
# "strict" is for developing; comment this out if it is not found:
use strict;

# this is tested with Perl 5 only (v5.8.6 from IndigoSTAR Software http://www.indigostar.com)
if ( $] < 5 )
	{
	die "Perl interpreter too old ($]) -- use Perl version 5!$/";
	}

# version 1, 09.02.2009: seems to work


# our manual:
my $usage = qq"
$0 - by MaEr

corrects the section hierarchy of an article by using its table of contents

1) copy table of contents from WP to file
2) copy the source code of the article to another file
3) run this program; check out for warnings and error messages
4) in WP: replace the source code of the article with the output of this program
5) in WP: check the differences (make a diff) and check the preview
6) in WP: save the changes

usage:
perl -W $0 OPTIONS 

OPTIONS are:
--toc=FILENAME		the program will read the table of contents from this file
--article-in=FILENAME	the program will read the article text from this file
--article-out=FILENAME	the program will write the article text to this file
--log=FILENAME		the log file
			(if no file name is defined, it will be '$0.log')
--help			displays this message

";

# without any arguments, we complain and exit:
if ( $#ARGV == -1 )
	{
	die $usage . $/;
	}

# collect the information we need:
# the toc input file:
my $toc;
# the article input file:
my $article_in;
# the article output file:
my $article_out;
# the log file (default: <program_name> + .log):
my $log = "$0.log";

# browse the command line arguments:
foreach my $arg ( @ARGV )
	{
	if ( $arg eq '--help' )
		{
		print $usage;
		exit;
		}
	
	# ARGNAME=ARGVALUE
	my ( $arg_name, $arg_value ) = split( /=/, $arg, 2);
	if ( defined $arg_name and $arg_name eq '--toc' and defined $arg_value )
		{
		$toc = $arg_value;
		}
	elsif ( defined $arg_name and $arg_name eq '--article-in' and defined $arg_value )
		{
		$article_in = $arg_value;
		}
	elsif ( defined $arg_name and $arg_name eq '--article-out' and defined $arg_value )
		{
		$article_out = $arg_value;
		}
	elsif ( defined $arg_name and $arg_name eq '--log' and defined $arg_value )
		{
		$log = $arg_value;
		}
	else
		{
		die "unknown argument: $arg_name$/";
		}
	}

# check if arguments are complete:
if ( not defined $toc )
	{
	print "$/toc input file is missing; use --toc$/$/";
	die "$usage$/";
	}
if ( not defined $article_in )
	{
	print "$/article input file is missing; use --article_in$/$/";
	die "$usage$/";
	}
if ( not defined $article_out )
	{
	print "$/article output file is missing; use --article_out$/$/";
	die "$usage$/";
	}

open ( LOG, "> $log" )
	or die "cannot write to log file $log: $!$/";

# read table of contents:
print LOG "opening toc input file $toc$/";
open ( TOC, "< $toc" )
	or die "cannot read from $toc: $!$/";

my @section_numbers;
my @section_titles;
while ( <TOC> )
	{
	my $line = $_;
	chomp $line;
	# skip empty lines:
	if ( $line eq '' )
		{
		next;
		}
	# line must start with optional bullet points and
	# with one or more digits separated by dots (.)
	# mozilla firefox 3.0 uses the following bullet points: * o + # *
	if ( $line =~ m/^\s*[\*o\+\#]?\s*((\d+)(\.\d+)*)\s+(.*)/ )
		{
		# extract section number:
		my $section_number = $1;
=pod
		my $section_number_1 = $1;
		if ( not defined $section_number_1 )
			{
			# error
			my $msg = "warning: cannot find section number in TOC line '$line'$/";
			print STDERR $msg;
			print LOG $msg;
			}
		my $section_number_2 = $2;
		if ( not defined $section_number_2 )
			{
			$section_number_2 = '';
			}
		my $section_number = $section_number_1 . $section_number_2;
=cut

		# extract section title:
		# my $section_title = $3;
		my $section_title = $4;
		if ( not defined $section_title )
			{
			# error
			my $msg = "warning: cannot find section title in TOC line '$line'$/";
			print STDERR $msg;
			print LOG $msg;
			$section_title = '';
			}
		$section_title = &remove_white_space( $section_title );

		print LOG "found$/\tsection number\t'$section_number'$/\tsection title\t'$section_title'$/";

		push @section_numbers, $section_number;
		push @section_titles, $section_title;
		}
	}

print LOG "found $#section_numbers section numbers and $#section_titles section titles$/";
if ( $#section_numbers != $#section_titles )
	{
	my $msg = "error: section number count non equal section titles count!$/";
	print LOG $msg;
	print STDERR $msg;
	}

# index in the toc arrays (@section_numbers and @section_titles):
my $toc_position = 0;

print LOG "going to close toc file$/";
close TOC;
# done with reading table of contents


# reading article:
my @article_lines;
print LOG "opening article input file $article_in$/";
open ( ART_IN, "< $article_in" )
	or die "cannot read from $article_in: $!$/";

while ( <ART_IN> )
	{
	my $line = $_;
	chomp $line;

	if ( $line =~ m/^=.*=\s*$/ )
		{
		# TODO hier weiter:
		# am ende von $article kontrollieren, ob die anzahl stimmt
		print LOG "$/found header in line $.: '$line'$/";
		my $line_without_markup = &remove_markup( $line );
		my $current_section_title = $section_titles[ $toc_position ];
		if ( $line_without_markup eq $current_section_title )
			{
			print LOG "match: section title in article is equal to section title in toc$/";

			print LOG "line: old:\t'$line'$/";
			$line =~ s/^=+//;
			$line =~ s/=+\s*//;
			my $level_markup = &get_level_markup( $section_numbers[ $toc_position ] );
			$line = $level_markup . $line . $level_markup;
			print LOG "line: new:\t'$line'$/";
			}
		else
			{
			my $msg = "warning: no match:$/section title in article:\t'$line_without_markup'$/section title in toc\t\t'$current_section_title'$/";
			print LOG $msg;
			print STDERR $msg;
			}

		$toc_position++;
		}
	# print ART_OUT $line . $/;
	push @article_lines, $line;
	}

print LOG "going to close article input file$/";
close ART_IN;


# writing corrected version of article:
print LOG "going to open article output file $article_out$/";
open ( ART_OUT, "> $article_out" )
	or die "cannot write to $article_out: $!$/";

foreach my $line ( @article_lines )
	{
	print ART_OUT $line . $/;
	}


print LOG "going to close article output file$/";
close ART_OUT;
# done with article

print LOG "going to close log file$/";
close LOG;

# input: the section number
# output: a certain number of '=' signs
sub get_level_markup( $ )
	{
	my $number = shift;
	if ( not defined $number )
		{
		# error
		my $msg = "error: input argument of &get_level_markup() is not defined!$/";
		print STDERR $msg;
		print LOG $msg;
		}
	elsif ( $number =~ m/^\d+$/ )
		{
		return '==';
		}
	elsif ( $number =~ m/^\d+\.\d+$/ )
		{
		return '===';
		}
	elsif ( $number =~ m/^\d+(\.\d+){2}$/ )
		{
		return '====';
		}
	elsif ( $number =~ m/^\d+(\.\d+){3}$/ )
		{
		return '=====';
		}
	elsif ( $number =~ m/^\d+(\.\d+){4}$/ )
		{
		return '======';
		}
	else
		{
		# error...
		my $msg = "warning: &get_level_markup() could not analyse input argument '$number'$/";
		print STDERR $msg;
		print LOG $msg;
		}
	}

# input: a string
# output: the string without leading and trailing spaces and tab stops
# 	and without double spaces and tab stops
sub remove_white_space( $ )
	{
	my $text = shift;
	if ( not defined $text )
		{
		# error
		my $msg = "error: input argument of &remove_white_space() is not defined in line $.!$/";
		print STDERR $msg;
		print LOG $msg;
		}
	print LOG "remove_white_space(): in:\t'$text'$/";
	$text =~ s/^\s+//;
	$text =~ s/\s+$//;
	# replace double spaces and tab stops by single space:
	$text =~ s/\s+/ /g;
	print LOG "remove_white_space(): out:\t'$text'$/";
	return $text;
	}

# input: a string
# output: the string without too much wiki-markup
sub remove_markup( $ )
	{
	my $text = shift;
	if ( not defined $text )
		{
		# error
		my $msg = "error: input argument of &remove_markup() is not defined in line $.!$/";
		print STDERR $msg;
		print LOG $msg;
		}
	print LOG "remove_markup(): in:\t'$text'$/";
	$text =~ s/'''//g;
	$text =~ s/''//g;
	$text =~ s/^=+//;
	$text =~ s/=+\s*$//;
	$text =~ s/(\[\[.*?\]\])/&remove_wikilinks($1)/eg;
	$text = &remove_white_space( $text );
	print LOG "remove_markup(): out:\t'$text'$/";
	return $text;
	}

sub remove_wikilinks( $ )
	{
	my $text = shift;
	if ( not defined $text )
		{
		# error
		my $msg = "error: input argument of &remove_wikilinks() is not defined in line $.!$/";
		print STDERR $msg;
		print LOG $msg;
		}
	my $new_text;
	if ( $text =~ m/\|/ )
		{
		$text =~ m/^\[\[[^\|]+\|(.*?)\]\]$/;
		$new_text = $1;
		}
	else
		{
		$text =~ m/^\[\[(.*?)\]\]$/;
		$new_text = $1;
		}
	print LOG "remove_wikilinks(): in:\t'$text'$/remove_wikilinks(): out:\t'$new_text'$/";
	return $new_text;
	}