package xm::pfe::wordset2words;
use strict;
use xm::o;

# DESC() -- return the multi-line description text of this module.
# The text starts with an empty line and ends with a newline; it is
# handed to xm::o::args_stdin by ARGS.
sub DESC
{
    my @lines = (
        "",
        "  this script is derived from wordset2index ",
        "  - but instead of generating one xml-page per wordset, there is now",
        "  one page per export-entry. However the filenames are along the ",
        "  lines of the wordset, and therefore the grab/index process is",
        "  widely identical. We just need another step in the middle to",
        "  cut out the export-entries. resolve/spitout are very similar",
        "  after that. Therefore just like wordset2index, we will...",
        "  get a list of wordsets, walk the text of each wordsetfile, ",
        "  create an internal database of all XREFWORDREF and of course",
        "  the defintion, make the xrefs filenamelike to get them spit out",
        "  a bit later. When the xrefwordref-database is complete, walk ",
        "  the texts and attach to <XREF>-data the appropriate href to have",
        "  an exact cross-reference in html' url-encoding.",
        "",
    );
    return join("\n", @lines);
}

# ARGS(@args) -- pass the caller's arguments, followed by the DESC text,
# to xm::o::args_stdin and return whatever that call returns.
sub ARGS
{
    return xm::o::args_stdin(@_, DESC());
}

# DO($in) -- cut a word-set document into one xml page per exported word.
#
# $in is the xml text; every <ITEMWORDSET> element in it that carries a
# non-empty <NAMEWORDSET> is processed, anything else in $in is ignored.
#
# The base name of the index page is "index-words" unless the package
# hash %o has an "index-words" entry (NOTE(review): %o is presumably the
# option hash provided by xm::o -- it is not declared in this file).
#
# Side effects: writes one "w-<wordset>-0<n>.xml" file per word and one
# "<index>.xml" file into the current directory, and prints progress
# markers to STDERR. A file that cannot be opened is reported on STDERR
# and skipped.
#
# Returns the names of the files written, one per line, the index file
# last.
sub DO
{
    my $in = shift;
    my $index = "index-words";
    $index = $o{"index-words"} if exists $o{"index-words"};
    print STDERR "<$index> ";

    my @keys;     # wordset names in document order
    my $wordset;  # {wordset}{text|desc|refs} and {wordset}{name}{word}
    my $word_id;  # {word} => page name, first wordset defining it wins
    my $word;     # {wordset}{word}{file|text}

    # Locate the first <$tag ...>...</$tag ...> element in $text and return
    # (open tag, content, close tag), or an empty list if there is none.
    # The content must be at least one character long.
    my $find_element = sub {
        my ($tag, $text) = @_;
        if ($text =~ m{ (<\Q$tag\E\b[^<>]*>)
                        ((?:.(?!</?\Q$tag\E\b))*.)
                        (</\Q$tag\E\b[^<>]*>) }sx)
        {
            return ($1, $2, $3);
        }
        return;
    };

    # Content of the first <$tag> element in $text, "" if there is none.
    my $element_body = sub {
        my @parts = $find_element->(@_);
        return @parts ? $parts[1] : "";
    };

    # stage 1 : grab the wordsets

    $in =~ s{ (<ITEMWORDSET\b[^<>]*>)
              ((?:.(?!</?ITEMWORDSET\b))*.) (</ITEMWORDSET\b[^<>]*>)
            }
    {
        my $item = $1.$2.$3;
        my $name = $element_body->("NAMEWORDSET", $item);

        if (length $name)
        {
            push @keys, $name;
            $$wordset{$name}{text} = $item;
            # desc is stored but not read again anywhere in this sub
            $$wordset{$name}{desc} = $element_body->("CSTRWORDCNT", $item);
        }
        ""
    }gsex;

    # stage 2 : grab XREFWORDREFs, enumerate them and assign page names.
    #           The text itself is left unchanged by this pass.

    for my $key (@keys)
    {
        my $count = 0;
        $$wordset{$key}{text} =~
            s{ (<XREFWORDREF\b[^<>]*>)
               ((?:.(?!</?XREFWORDREF\b))*.) (</XREFWORDREF\b[^<>]*>)
             }
            {
                $count++;
                $$wordset{$key}{name}{$2} = "w-$key-0$count.html";
                $1.$2.$3
            }gsex;
        # refs is stored but not read again anywhere in this sub
        $$wordset{$key}{refs} = $count;
    }

    # stage 3 : build a table of XREFWORDREFs across all wordsets

    for my $key (@keys)
    {
        for my $name (keys %{$$wordset{$key}{name}})
        {
            $$word_id{$name} = $$wordset{$key}{name}{$name}
                unless exists $$word_id{$name};
        }
    }

    # stage 4 : resolve XREFs, prefer wordset neighbours, then globals.
    #           It may or may not be a good solution to href nonresolvables,
    #           they get pointed at the index page.
    #           The href goes in front of the original attributes in every
    #           case; the closing tag gets the attribute name appended.

    for my $key (@keys)
    {
        $$wordset{$key}{text} =~
            s{ (<XREF\b)([^<>]*>)
               ((?:.(?!</?XREF\b))*.) (</XREF\b)([^<>]*>)
             }
            {
                my ($open, $attrs, $label, $close, $tail) = ($1, $2, $3, $4, $5);
                my $target = "$index.html";
                if (exists $$wordset{$key}{name}{$label})
                {
                    $target = $$wordset{$key}{name}{$label};
                }
                elsif (exists $$word_id{$label})
                {
                    $target = $$word_id{$label};
                }
                $open." href=\"".$target."\"".$attrs.$label.$close." href".$tail
            }gsex;
    }

    # stage 5 : grab each ITEMWORDREF and file it under its page name.

    for my $key (@keys)
    {
        $$wordset{$key}{text} =~
            s{ (<ITEMWORDREF\b[^<>]*>)
               ((?:.(?!</?ITEMWORDREF\b))*.) (</ITEMWORDREF\b[^<>]*>)
             }
            {
                my $item = $1.$2.$3;
                my $name = $element_body->("XREFWORDREF", $item);

                if (length $name and exists $$wordset{$key}{name}{$name})
                {
                    $$word{$key}{$name}{file} = $$wordset{$key}{name}{$name};
                    $$word{$key}{$name}{text} = $item;
                }
                ""
            }gsex;
    }

    # stage 6 : create the words html-xml's and $index.xml

    my $out = "";   # the return of this sub is the list of written files.
    my $idx;        # a hash (!!) for the index-file, key is "word<serial>"
    my $serial = 1; # keeps index keys unique when wordsets share a word
    for my $key (@keys)
    {
        # sorted, so that serials and the returned list are reproducible
        for my $w (sort keys %{$$word{$key}})
        {
            my $page = $$word{$key}{$w}{file};
            $$idx{"$w<$serial>"} .= "<ITEMWORDENTRY>\n"
                ."<XREFWORDREF href=\"$page\">".$w."</XREFWORDREF href>\n"
                .join("", $find_element->("XDEFSTACK", $$word{$key}{$w}{text}))."\n"
                ."<NAMEWORDSETINFO>"." -- "."</NAMEWORDSETINFO>\n"
                ."<NAMEWORDSET>".$key."</NAMEWORDSET>\n"
                ."</ITEMWORDENTRY>\n";
            $serial++;

            (my $file = $page) =~ s/\.html\z/.xml/;
            my $fh;
            if (not open $fh, ">", $file)
            {
                print STDERR "[cannot write ", $file, ": ", $!, "] ";
                next;
            }
            print $fh "<title>",$w,"</title>\n";
            print $fh $$word{$key}{$w}{text};
            close $fh
                or print STDERR "[cannot close ", $file, ": ", $!, "] ";
            $out .= $file."\n";
        }
    }

    my $file = "$index.xml";
    if (open my $fh, ">", $file)
    {
        for my $w (sort keys %$idx)
        {
            print $fh $$idx{$w};
        }
        close $fh
            or print STDERR "[cannot close ", $file, ": ", $!, "] ";
        $out .= $file."\n";
    }
    else
    {
        print STDERR "[cannot write ", $file, ": ", $!, "] ";
    }

    print STDERR "</$index>\n";
    return $out;
}

1;

