/***********************************************************************
 *               Copyright (C) 1995 Joe English
 *                   Freely redistributable
 ***********************************************************************
 *
 * rdsgmls.c,v 1.24 1998/11/20 03:48:50 joe Exp
 *
 * Author: 	Joe English
 * Created: 	Jan 1995
 * Description: read output of sgmls.
 * Bugs:
 * 	This is *really* short on error checking.
 *	It never tests the return value of malloc(),
 *	and input errors are handled by dumping core.
 *	Needless to say, it should be a *bit* more robust.
 *
 * 1998/11/20 03:48:50
 * 1.24
 */

#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "project.h"
#include "strmap.h"
#include "strmgt.h"
#include "pile.h"
#include "esis.h"
#include "esisp.h"
#include "lineout.h"	/* from sgmls source distribution */

/* Classification of one unit of SGMLS data, as reported by rdchar(). */
typedef enum
{
	CHCODE_DATA  = 0,	/* ordinary character, "\\", or a \nnn octal escape */
	CHCODE_RE    = 1,	/* the "\n" escape: record end */
	CHCODE_RS    = 2,	/* the \012 escape: record start */
	CHCODE_SDATA = 3,	/* the "\|" escape: SDATA entity bracket */
	CHCODE_EOLN  = 4,	/* a real newline character in the input */
	CHCODE_EOF   = 5,	/* end of input */
	CHCODE_ERR   = 6	/* escape sequence that could not be parsed */
} CHCODE;

/* Read a single character or \-escape sequence from sgmls.
 *
 * Stores the classification of what was read in *code_rtn and
 * returns the associated character value:
 *
 *	input			*code_rtn	return value
 *	real newline		CHCODE_EOLN	'\n'
 *	end of input (-1)	CHCODE_EOF	-1
 *	"\\"			CHCODE_DATA	'\\'
 *	"\n" escape		CHCODE_RE	'\n'
 *	"\|"			CHCODE_SDATA	'|'
 *	"\ooo" (3 octal digits)	CHCODE_RS if the value is 012,
 *				else CHCODE_DATA; the decoded value
 *	any other character	CHCODE_DATA	the character itself
 *	malformed escape	CHCODE_ERR	first character after the '\'
 *
 * A malformed octal escape consumes the offending character as well;
 * it cannot be pushed back here.
 * %%% TODO: Handle  \#d;  escape sequences (decimal escape; new in nsgmls).
 */
static int rdchar(ESISInputStream stream, CHCODE *code_rtn)
{
    int ch = ESISgetc(stream);

    if (ch == '\n')
	*code_rtn = CHCODE_EOLN;
    else if (ch == -1)
	*code_rtn = CHCODE_EOF;
    else if (ch == '\\') {
	switch (ch = ESISgetc(stream)) {
	case '\\' :	*code_rtn = CHCODE_DATA;	break;
	case 'n'  :
	    /* Hand back the record end itself, not the escape letter */
	    ch = '\n';
	    *code_rtn = CHCODE_RE;
	    break;
	case '|'  : 	*code_rtn = CHCODE_SDATA;	break;
	case '0' : case '1' : case '2' : case '3' :
	case '4' : case '5' : case '6' : case '7' :
	{ /* parse octal sequence: exactly three digits, each in 0..7 */
	    int value = ch - '0';
	    int ndigits = 1;

	    while (ndigits < 3) {
		/* Kept as int so that -1 (end of input) is never
		 * mistaken for a digit; '8' and '9' are rejected too.
		 */
		int digit = ESISgetc(stream);
		if (digit < '0' || digit > '7')
		    break;
		value = value * 8 + (digit - '0');
		ndigits++;
	    }
	    if (ndigits == 3) {
		ch = value;
		*code_rtn = (ch == 012) ? CHCODE_RS : CHCODE_DATA;
	    } else {
		*code_rtn = CHCODE_ERR;
	    }
	    break;
	}
	default :
	    ASSERT(0, "Bad character in escape sequence");
	    *code_rtn = CHCODE_ERR;
	} /* switch */
    } /* if ch == '\\' */
    else {
	*code_rtn = CHCODE_DATA;
    }

    return ch;
}

/* Discard input up to and including the next newline,
 * or up to end of input (-1), whichever comes first.
 * (Used for SGMLS events that are unused, unimplemented or unrecognized.)
 */
static void eatline(ESISInputStream stream)
{
    for (;;) {
	int c = ESISgetc(stream);
	if (c == '\n' || c == -1)
	    return;
    }
}

/* Read the next whitespace-delimited token and intern it.
 *
 * Leading whitespace (including newlines) is skipped, then characters
 * are collected until the next whitespace character or end of input.
 * Sets *eoln = 1 if the character that ended the token is '\n',
 * 0 otherwise (other whitespace, or end of input).
 *
 * The token is assembled in a static scratch buffer that grows on
 * demand and is reused by later calls; the value returned is whatever
 * intern() gives back for it.
 * Running out of memory for the scratch buffer terminates the program
 * with abort() instead of writing through a NULL pointer.
 */
static ESISToken rdtoken(ESISInputStream stream, int *eoln)
{
    static char *tokbuf = 0;
    static size_t tokbufsize = 0;
    size_t n;
    int ch;

    if (!tokbuf) {
	tokbuf = malloc(80);
	if (!tokbuf)
	    abort();
	tokbufsize = 80;
    }

    /* skip whitespace */
    do {
	ch = ESISgetc(stream);
    } while (ch != -1 && isspace(ch));

    /* read token */
    n = 0;
    while (ch != -1 && !isspace(ch))
    {
	tokbuf[n++] = (char)ch;
	if (n >= tokbufsize) {
	    /* Grow before the next store so there is always room for
	     * the terminator; keep the old buffer until realloc succeeds.
	     */
	    char *grown = realloc(tokbuf, tokbufsize * 2);
	    if (!grown)
		abort();
	    tokbuf = grown;
	    tokbufsize *= 2;
	}
	ch = ESISgetc(stream);
    }
    *eoln = (ch == '\n');
    tokbuf[n] = '\0';

    return intern(tokbuf);
}

/* Read remainder of input line,
 * unescaping data and storing it on 'p'.
 *
 * Ordinary characters and decoded escapes are appended as-is; a
 * record-end escape ("\n") is stored as a newline character.
 * SDATA brackets ("\|") and record-start escapes (\012) are dropped.
 * Reading stops at a real newline, at end of input, or at an
 * unparseable escape sequence.
 *
 * Returns: '\0'-terminated string, as handed back by pfinish(p).
 */
static char *rdunescape(ESISInputStream stream, pile p)
{
    pstart(p);
    for (;;)
    {
	int ch;
	CHCODE chcode;
	ch = rdchar(stream,&chcode);
	switch (chcode) 
	{
	    case CHCODE_DATA:
		paddch(p, ch);
		continue;
	    case CHCODE_RE:
		/* Store the record end itself rather than relying on
		 * the character value reported for the escape.
		 */
		paddch(p, '\n');
		continue;
	    case CHCODE_EOLN :
	    case CHCODE_EOF :
		break;
	    case CHCODE_SDATA:
		/* %%% this can happen if SDATA entities are referenced
		 * %%% in CDATA declared attribute values. Ignore it.
		 */
		continue;
	    case CHCODE_RS :
		/* This can happen if there are newlines in PIs.
		 * Why, I don't know, but ignore it.
		 */
		continue;
	    default :
		ASSERT(0, "Bad character in escape sequence");
	}
	break;
    }
    paddch(p,'\0');
    return pfinish(p);
}

/* Read and parse a character data record (the rest of the current
 * input line) and create data nodes through the builder 'ep' with
 * esis_create_datanode():
 *   - a run of ordinary characters becomes one EN_CDATA node;
 *   - text between two "\|" escapes becomes one EN_SDATA node;
 *   - a "\n" escape outside SDATA becomes an EN_RE node holding "\n",
 *     while inside SDATA it is appended to the text as a newline;
 *   - record-start escapes are ignored.
 * Text is accumulated on ep->datapile.  Nothing is returned; a single
 * record can produce several nodes.
 * %%% The state machine below is convoluted; see the inline notes.
 */
static void rddata(ESISBuilder ep, ESISInputStream stream)
{
    pile p = ep->datapile;
    char *text;
    int ch;
    CHCODE chcode;
    /* s_nonode:  no text is being accumulated
     * s_incdata: accumulating a CDATA run
     * s_insdata: accumulating SDATA text (between "\|" brackets)
     * s_readre:  transient; a record end was just read
     * s_end:     end of line or end of input reached
     */
    enum { s_nonode, s_incdata, s_insdata, s_readre, s_end } 
	state = s_nonode, nextstate = s_nonode;

    do {
	nextstate = state;

	/* s_readre and s_end are never current at the top of the loop */
	ASSERT(state == s_nonode || state == s_insdata || state == s_incdata,
		"Bad state");

	ch = rdchar(stream, &chcode);

	/* 'continue' below stays in the current state and reads the next
	 * character; 'break' falls through to the state transition code.
	 */
	switch (chcode) {
	    case CHCODE_EOLN:
	    case CHCODE_EOF:
		nextstate = s_end;
		break;
	    case CHCODE_DATA:
	    addch:
		/* first character after s_nonode starts a CDATA run */
		if (state == s_nonode) {
		    pstart(p);
		    state = s_incdata;
		}
		paddch(p,ch);
		continue;
	    case CHCODE_RS:	/* ignore */
		continue;
	    case CHCODE_RE:
		/* inside SDATA a record end is just another character */
		if (state == s_insdata) {
		    ch = '\n';
		    goto addch;
		}
		nextstate = s_readre;
		break;
	    case CHCODE_SDATA:
		/* "\|" toggles: closes SDATA if inside it, else opens it */
		nextstate = (state == s_insdata) ? s_nonode : s_insdata;
		break;
	    default :
		/* NOTE(review): after this ASSERT control still reaches
		 * 'leavestate' with nextstate == state, so a pending
		 * CDATA run is finished without a new pstart() - confirm
		 * whether that is intended when ASSERT does not abort.
		 */
		ASSERT(0, "Bad chcode");
	}

	/* leave current state: */
leavestate:
	/* flush accumulated text as a CDATA or SDATA node,
	 * or emit the RE node for a record end just read
	 */
	if (state == s_incdata || state == s_insdata) {
	    paddch(p,'\0');
	    text = pfinish(p);
	    esis_create_datanode(
		ep, state == s_incdata ? EN_CDATA : EN_SDATA, text);
	} else if (state == s_readre) {
	    esis_create_datanode(ep,EN_RE, "\n");
	}

	/* enter new state: */
	state = nextstate;
	if (state == s_insdata)
	    pstart(p);
	/* s_readre is left immediately: jump back to emit the RE node,
	 * after which the state becomes s_nonode
	 */
	if (state == s_readre) {
	    nextstate = s_nonode;
	    goto leavestate;
	}
    } while (state != s_end);

    return;
}

/*
 * Aname val
 * Dename name val
 *
 * Parse the "name val" portion of an attribute record and attach the
 * resulting attribute to node 'n'.  VAL starts with one of:
 * 	  IMPLIED
 * 	  CDATA	data
 * 	  NOTATION nname
 * 	  ENTITY name...
 * 	  TOKEN	token...
 *
 * For IMPLIED the attribute value is a null pointer; for everything
 * else it is the unescaped remainder of the line, stored on 'p'.
 * A NOTATION value is additionally recorded on 'n' as ENTPROP_NOTATION.
 * Returns whatever esis_create_attribute() returns.
 *
 * BUGS: CDATA and SDATA are not told apart inside CDATA attribute
 * values, and the declared value types are not otherwise treated
 * differently from one another.
 */

static ESISNode rdattribute(ESISNode n, ESISInputStream stream, pile p)
{
    int eoln;
    char *value = 0;
    ESISToken declared;
    ESISToken name = rdtoken(stream, &eoln);

    ASSERT(!eoln, "attribute ended prematurely");
    declared = rdtoken(stream, &eoln);

    if (strcmp(declared, "IMPLIED") == 0) {
	/* no value follows; the record must end here */
	ASSERT(eoln,"Extra data after IMPLIED attribute record");
	return esis_create_attribute(n, name, value);
    }

    value = rdunescape(stream, p);
    if (strcmp(declared, "NOTATION") == 0)
	esis_setprop(n, ENTPROP_NOTATION, value);
    /* %%% other cases ? */

    return esis_create_attribute(n, name, value);
}

/*
 * "All data in syntactic content is a pseudo-element. [...]
 * References to data entities that are not replaced in ESIS
 * are treated as peers of characters"
 * Charles Goldfarb in <comp-std-sgml@naggum.no> 28 Apr 95
 *
 * Make sure the builder's current node is a pseudo-element (EN_PEL),
 * opening one under the current node if it is not.
 */
static void ensure_pel(ESISBuilder ep)
{
    if (ep->curnode->type != EN_PEL) {
	ASSERT(ep->curnode->type == EN_EL, "data in non-EL node"); 
	esis_open_node(ep, EN_PEL);
    }
}

/* Close the builder's current node if it is a pseudo-element (EN_PEL);
 * otherwise do nothing.
 */
static void ensure_not_pel(ESISBuilder ep)
{
    if (ep->curnode->type != EN_PEL) 
	return;
    (void)esis_close_node(ep);
}

/* Duplicate a '\0'-terminated string into freshly malloc()ed storage.
 * Returns NULL if the allocation fails; the caller owns the copy.
 */
static char *rdsgmls_strsave(const char *str)
{
    size_t size = strlen(str) + 1;
    char *copy = malloc(size);

    if (copy)
	memcpy(copy, str, size);
    return copy;
}

/* estream_load_sgmls:
 * Reads SGMLS event stream, builds an ESIS tree.
 *
 * Each input line starts with a one-character event code; the rest of
 * the line is parsed according to that code.  Reading stops at end of
 * input or after the CONFORMING_CODE record.
 *
 * System and public identifiers are held in malloc()ed copies until the
 * next notation or entity declaration consumes them; anything still
 * pending when reading stops is released.
 *
 * Returns: new ESISDocument, or NULL if memory for a pending
 * identifier could not be allocated (the partial document is freed).
 */
ESISDocument estream_load_sgmls(ESISInputStream stream)
{
    int done = 0;		/* 0: keep reading, 1: finished, 2: failed */
    ESISNode newelement = 0;	/* element opened by A records, awaiting ( */
    char *sysid = 0;		/* pending system identifier, if any */
    char *pubid = 0;		/* pending public identifier, if any */
    int eoln = 0;
    ESISBuilder ep = esis_builder_start();
    pile p = ep->datapile;

    /* Inside the switch, 'continue' goes straight to the next record,
     * while 'break' first runs the consistency ASSERTs after the switch.
     */
    while (!done)
    {
	int code = ESISgetc(stream);

	switch (code)
	{
	case -1 :
	    done = 1;
	    break;

	case CONFORMING_CODE:
	    /* ASSERT "current node is document root"
	     * return EOF
	     */
	    eatline(stream);
	    done = 1;
	    break;

	/*
	 * Auxiliary information codes:
	 */
	case SYSID_CODE:
	case PUBID_CODE:
	{
	    char *str, *copy;
	    pilemark m;
	    m = pmark(p);
	    str = rdunescape(stream,p);
	    /* Copy off the pile before the pile space is released */
	    copy = rdsgmls_strsave(str);
	    prelease(p,m);
	    if (!copy) {
		done = 2;	/* out of memory: give up on this document */
		continue;
	    }
	    if (code == SYSID_CODE) {
		ASSERT(!sysid, "Input error: sysid read but not consumed");
		free(sysid);	/* don't leak an unconsumed identifier */
		sysid = copy;
	    } else {
		ASSERT(!pubid, "Input error: pubid read but not consumed");
		free(pubid);	/* don't leak an unconsumed identifier */
		pubid = copy;
	    }
	    continue;
	}

	case FILE_CODE:
	case LOCATION_CODE:

	case LINK_ATTRIBUTE_CODE:
	case INCLUDED_ELEMENT_CODE:
	case DEFINE_EXTERNAL_TEXT_ENTITY_CODE:
	    /* ignore these for now */
	    eatline(stream);
	    continue;

	/*
	 * Declarations:
	 */
	case DEFINE_NOTATION_CODE:	/* %%% not used */
	{
	    (void)rdtoken(stream,&eoln);	/* notation name */
	    ASSERT(eoln, "Misparsed NOTATION code");
	    free(pubid);		/* %%% save these */
	    free(sysid);		/* %%% map name->pub,sysid */
	    pubid = sysid = 0;
	    continue;
	}

        /* Dename name val */
	case DATA_ATTRIBUTE_CODE:
	{
	    ESISNode ent;
	    ESISToken name = rdtoken(stream, &eoln);
	    ASSERT(!eoln, "Incomplete data attribute definition");
	    ent = esis_find_entity(ep, name);
	    ASSERT(ent, "Data attribute definition for nonexistant entity");
	    if (!ent) {
		/* Nothing to attach the attribute to: skip what is
		 * left of this record instead of passing on a null node.
		 */
		if (!eoln)
		    eatline(stream);
		continue;
	    }
	    rdattribute(ent, stream, p);
	    continue;
	}

        /* Eename typ nname */
	/* Iename typ */
	/* Sename */
	/* type in { CDATA, NDATA, SDATA } */
	/* %%% logic is way too convoluted */
	case DEFINE_EXTERNAL_ENTITY_CODE:
	case DEFINE_INTERNAL_ENTITY_CODE:
	case DEFINE_SUBDOC_ENTITY_CODE:
	{
	    ESISNode ent;
	    ESISToken nname, ename;
	    ename = rdtoken(stream, &eoln);
	    ent = esis_create_entity(ep, ename);
	    /* External and subdoc entities consume the pending ids */
	    if (code != DEFINE_INTERNAL_ENTITY_CODE) {
		if (sysid) {
		    esis_setprop(ent, ENTPROP_SYSID, sysid); 
		    free(sysid);
		}
		if (pubid) {
		    esis_setprop(ent, ENTPROP_PUBID, pubid);
		    free(pubid);
		}
		sysid=pubid=0;
	    }
	    if (code == DEFINE_SUBDOC_ENTITY_CODE) {
		ASSERT(eoln, "Bad S command");
		continue;
	    }
	    ASSERT(!eoln,"Incomplete code");
	    (void)rdtoken(stream, &eoln); /* %%% entity type; not used */
	    if (code == DEFINE_EXTERNAL_ENTITY_CODE) {
		nname = rdtoken(stream, &eoln);
		esis_setprop(ent, ENTPROP_NOTATION, nname);
		/* %%% set props: NOTSYSID, NOTPUBID */
	    } else {
		ent->text = rdunescape(stream,p);
		eoln = 1;
	    }
	    ASSERT(eoln,"Bad entity defn");
	    continue;
	}

	case APPINFO_CODE:
	    /* %%% ignore this for now */
	    eatline(stream);
	    continue;

	/*+++
	 * Event codes:
	 */
	case DATA_CODE:
	{
	    ensure_pel(ep);
	    rddata(ep, stream);
	    break;
	}

	/* &name */
	case REFERENCE_ENTITY_CODE:
	{
	    ESISToken ename = rdtoken(stream,&eoln);
	    ESISNode ent = esis_find_entity(ep, ename);
	    ensure_pel(ep);
	    esis_open_node(ep,EN_REFERENCE);
	    ep->curnode->reference = ent;
	    ep->curnode->name = ename;
	    esis_close_node(ep);
	    break;
	}

	/* ?data */
	case PI_CODE:
	{
	    /* PIs are not addressable in HyTime, so they
	     * can appear anywhere in the tree.  In particular,
	     * a PEL node is *not* created.
	     */
	    char *text = rdunescape(stream,p);
	    esis_create_datanode(ep,EN_PI, text);
	    break;
	}

	/*
	 * Elements:
	 */
	case ATTRIBUTE_CODE:
	/* if (!newelement), newelement = new EL node (no GI, added later);
	 * create AT attribute of newelement.
	 * Continue.
	 */
	    if (!newelement) {
		ensure_not_pel(ep);
		newelement = esis_open_node(ep,EN_EL);
	    }
	    rdattribute(newelement, stream, p);
	    continue;
	case START_CODE:
	/* if newelement (created when attributes seen) assign GI
	 * else create new EL node.
	 * Return.
	 */
	    if (!newelement) {
		ensure_not_pel(ep);
		newelement = esis_open_node(ep,EN_EL);
	    }
	    newelement->name = rdtoken(stream, &eoln);
	    ASSERT(eoln, "Bad ( code)");
	    newelement = NULL;
	    break;
	case END_CODE:
	{
	    /* NOTE(review): 'gi' exists only when DEBUG is set, yet the
	     * second ASSERT below names it - presumably ASSERT expands to
	     * nothing without DEBUG; confirm against project.h.
	     */
#if DEBUG
	    ESISToken gi = 
#endif
	    rdtoken(stream, &eoln);
	    ensure_not_pel(ep);
	    ASSERT(ep->curnode->type == EN_EL, "totally out of sync");
	    ASSERT(ep->curnode->name == gi, "out of sync");
	    esis_close_node(ep);
	    break;
	}

	case START_SUBDOC_CODE:
	case END_SUBDOC_CODE:
	{
	    (void)rdtoken(stream, &eoln);/* %%% document entity name; unused */
	    ASSERT(eoln, "Misgrokked SUBDOC event");
	    continue;
	    /* %%% should create new SD node, DTD mgt */
	    /* %%% create PEL node?  I think so... */
	}

	default:
	    ASSERT(0,"Forgot to implement an event type...");
	    eatline(stream);
	    continue;
	}

	ASSERT(pubid == 0, "PUBID read but not used");
	ASSERT(sysid == 0, "SYSID read but not used");
	ASSERT(newelement == 0, "Attributes read but not used");
    } /* while (!done...) */

    /* Identifiers still pending here were never attached to a
     * declaration; release them rather than leaking them.
     */
    free(sysid);
    free(pubid);

    if (done == 1)
    {	/* Success */
	return esis_builder_finish(ep);
    } else {
	esis_free_document(esis_builder_finish(ep));
	return NULL;
    }
}

/*EOF*/
