/* This is a lex(1) file, see http://dinosaur.compilertools.net/
	* or http://en.wikipedia.org/wiki/Lex_programming_tool .
	*
	* Compilation on UNIX systems is done by
	*     make wileyEndNoteFilt
	* On other systems one may need to call lex or flex and cc explicitly:
	*     lex -8 -o wileyEndNoteFilt.c wileyEndNoteFilt.l
	*     cc [-s] [-O] [-D MSDOS] [-D WILEYENDNOTEFILT_RIS] -o wileyEndNoteFilt wileyEndNoteFilt.c -ll
	*
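	* Definition of the macro MSDOS means that the rewritten author and
	* editor lines are written with CR+LF line terminators. All other lines
	* are copied through with whatever terminator they had on input (a
	* conversion of the complete file can be achieved by the standard
	* unix2dos(1)).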
	*
	* Definition of the macro WILEYENDNOTEFILT_RIS means that the RIS type tags
	* "A1  - " and "A2  - " are handled supposing that there is only
	* one name per line and swapping first and last names.
	*
	* The executable works as a filter and converts author names produced
	* by downloading EndNote files from Wiley's  web pages such that they become
	* standardized. This means: the initials (first names) and last names of authors
	* are swapped and separated by a comma on output, and multiple
	* authors become separate lines.
	*
	* Use examples:
	*   cat *.enw | wileyEndNoteFilt
	*   cat *.enw | recode h1..h4 | wileyEndNoteFilt > tmp.end ; end2xml tmp.end > tmp.xml
	*
	* Notes:
	* The filter changes lines of the %A or %E tag containing at least one author.
	* This means (i) the author list is only parsed until a line feed, so author
	*   lists continued on lines that do not start with another tag will remain
	*   incomplete
	*   (ii) a tag followed by no author is copied through unchanged.
	*
	* Richard J. Mathar, 2010-01-21
	*/
%option noyywrap

%{
#include <string.h>
#include <stdlib.h>

/* Remove trailing carriage returns and line feeds from inp.
* The string is modified in place (it may be shortened on output).
* An empty string, or a string consisting only of CR/LF characters,
* is reduced to the empty string without touching memory in front of inp.
* @param inp A NUL-terminated, writable string.
*/
void trimlf(char *inp)
{
	size_t len = strlen(inp) ;

	/* walk backwards, but never beyond the first character */
	while ( len > 0 && ( inp[len-1] == '\n' || inp[len-1] == '\r' ) )
		len-- ;
	inp[len] = '\0' ;
}

/* Remove blanks (' ') at the start and/or end of inp.
* The string is modified in place (it may be shortened on output).
* An empty or all-blank string is reduced to the empty string without
* touching memory in front of inp.
* @param inp A NUL-terminated, writable string.
*/
void trim(char *inp)
{
	size_t len = strlen(inp) ;
	size_t lead = 0 ;

	/* chop trailing blanks, but never walk beyond the first character */
	while ( len > 0 && inp[len-1] == ' ' )
		len-- ;
	inp[len] = '\0' ;

	/* count the leading blanks and shift the remainder down once */
	while ( inp[lead] == ' ' )
		lead++ ;
	if ( lead > 0 )
		memmove(inp,inp+lead,len-lead+1) ;	/* +1 moves the terminating NUL as well */
}

#define WILEYENDNOTEFILT_MAX_AUTH 2048
#define WILEYENDNOTEFILT_MAX_TRAIL 32

/* Handle a single name without any separator
* @param inp A string of the form "firstname middle-name lastname".
*/
void revStr1(char *inp, const char *tag)
{

	/* Search for the last blank, usually right before the last name.
	* This does not yet handle cases like Marvin D. den Anker
	* with composite last names, or Henry Ford II with roman numbers like I
	* or II following...
	*/
	char * lastbl=strrchr(inp,' ') ;

	if ( lastbl != inp )
	{
		/* the revised string: lastname, firstinitls[, trail]; */
		char resul[WILEYENDNOTEFILT_MAX_AUTH] ;
	
		/* a trailing portion like Jr., II etc. German cases like Dr. Werner von Braun, where "Dr." is 
		* part of the trialing portion but syntactically in front of the first name are not handled
		* correctly. Similarly "Pope Paul Benedict" will yield strange results.
		* We also assume that the roman numerals are not followed by dots.
		*/
		char trail[WILEYENDNOTEFILT_MAX_TRAIL] ;

		int lastl ;
	
		memset(trail,'\0',WILEYENDNOTEFILT_MAX_TRAIL) ;
	
		/* back up if this is a composite name followed by Jr., Sr. or a dotless Roman numeral from I to VII */
		if ( strcmp(lastbl," I")==0 || strcmp(lastbl," II")==0 || strcmp(lastbl," III")==0 || strcmp(lastbl," Jr.")==0
			|| strcmp(lastbl," IV")==0 || strcmp(lastbl," V")==0 || strcmp(lastbl," VI")==0 || strcmp(lastbl," VII")==0 
			|| strcmp(lastbl," Sr.")==0
			)
		{
			/* copy the trailing piece over and chop it off the original string
			*/
			sprintf(trail,",%s",lastbl) ;
			*lastbl = '\0' ;
			lastbl=strrchr(inp,' ') ;
		}
	
		/* Start of the surname. Derived names of Spanish, Dutch or German origin. Does not catch the German
		* "Graefin" or similar titles which use diacritical vocals.
		* Start with the longest matches (that is: catch the "Baron" in "Baron von Munchhausen", not the "von").
		*/
		char * surn= strstr(inp," Baron ") ;	/* triggers also Baron von Munchhausen, for example */
		if ( surn == NULL)
			surn= strstr(inp," Duke ") ;
		if ( surn == NULL)
			surn= strstr(inp," Earl ") ;
		if ( surn == NULL)
			surn= strstr(inp," Graf ") ;	/* triggers also "Graf von", "Graf zu" etc */
		if ( surn == NULL)
			surn= strstr(inp," Gr\344fin ") ;	/* assume U+00E4 for the diaresis in UTF-8 */
		if ( surn == NULL)
			surn= strstr(inp," Herzog ") ;
		if ( surn == NULL)
			surn= strstr(inp," v. d. ") ;
		if ( surn == NULL)
			surn= strstr(inp," von ") ;	/* triggers also on "von der" */
		if ( surn == NULL)
			surn= strstr(inp," Von ") ;
		if ( surn == NULL)
			surn= strstr(inp," van ") ;	/* triggers also "van den", "van der", "van de" */
		if ( surn == NULL)
			surn= strstr(inp," Van ") ;	/* Van Morrisson is intepreted as a last name */
		if ( surn == NULL)
			surn= strstr(inp," da ") ;
		if ( surn == NULL)
			surn= strstr(inp," Da ") ;
		if ( surn == NULL)
			surn= strstr(inp," de ") ;
		if ( surn == NULL)
			surn= strstr(inp," De ") ;
		if ( surn == NULL)
			surn= strstr(inp," du ") ;
		if ( surn == NULL)
			surn= strstr(inp," Du ") ;
		if ( surn == NULL)
			surn= strstr(inp," do ") ;
		if ( surn == NULL)
			surn= strstr(inp," Do ") ;
		if ( surn == NULL)
			surn= strstr(inp," della ") ;
		if ( surn == NULL)
			surn= strstr(inp," Della ") ;
		if ( surn == NULL)
			surn= strstr(inp," le ") ;
		if ( surn == NULL)
			surn= strstr(inp," Le ") ;
		if ( surn == NULL)
			surn= strstr(inp," dos ") ;
		if ( surn == NULL)
			surn= strstr(inp," Dos ") ;
		if ( surn == NULL)
			surn= strstr(inp," ter ") ;
		if ( surn == NULL)
			surn= strstr(inp," Ter ") ;
		if ( surn == NULL)
			surn= strstr(inp," ten ") ;
		if ( surn == NULL)
			surn= strstr(inp," Ten ") ;
							/* the cases "e" and "i" are handled unsafely here:
							* probably another component of the family name
							* precedes these */
		if ( surn == NULL)
			surn= strstr(inp," e ") ;
		if ( surn == NULL)
			surn= strstr(inp," E ") ;
		if ( surn == NULL)
			surn= strstr(inp," i ") ;
		if ( surn == NULL)
			surn= strstr(inp," de ") ;
		if ( surn == NULL)
			surn= strstr(inp," De ") ;
		if ( surn == NULL)
			surn= strstr(inp," den ") ;
		if ( surn == NULL)
			surn= strstr(inp," Den ") ;

		/* if no such modifier is found: assume the last name starts at the last blank */
		if ( surn == NULL)
			surn= lastbl ;
	
		lastl= strlen(inp)-(surn-inp)-1 ;
		memset(resul,'\0',WILEYENDNOTEFILT_MAX_AUTH) ;
		strncpy(resul,surn+1,lastl) ;
	
		if ( surn != inp)
		{
			/* output separator comma: lastname, firstname. Firstname starts with a blank. */
			strcat(resul,",") ;
#ifdef WILEYENDNOTEFILT_RIS
			strcat(resul," ") ;
#endif
			strncat(resul,inp,surn-inp) ;
		}

		/* if there is a trailing portion append. Usually this is
		* just '\0' and appending does not hurt.
		*/
		strcat(resul,trail) ;

		trim(resul) ;
#ifdef MSDOS
		fprintf(yyout,"%s %s\r\n",tag,resul) ;
#else
		fprintf(yyout,"%s %s\n",tag,resul) ;
#endif
	}
	else
		/* If there is no blank, this is probably a single last name, which we copy as is
		* (without comma) to stdout.
		*/
#ifdef MSDOS
		fprintf(yyout,"%s%s\r\n",tag,inp) ;
#else
		fprintf(yyout,"%s%s\n",tag,inp) ;
#endif
}
#undef WILEYENDNOTEFILT_MAX_AUTH
#undef WILEYENDNOTEFILT_MAX_TRAIL

/** Split the line (without the starting "%A " or "%E " EndNote tag) into
* the individual authors (which are each followed by a comma).
* @param inp A string of the form "first-author, second-author, last-author,"
*            or "first-author, second-author, last-author" or "first-author".
* @param tag The "%A" or "%E"
*/
void revStr(const char *inp, const char *tag)
{
	/* strtok(3) modifies 'inp', so we construct a temporary copy.
	*/
	char *inpsave = (char *) malloc((strlen(inp)+1)*sizeof(char)) ;
	char *tok ;
	char *tokreent ;

	strcpy(inpsave,inp) ;
	trimlf(inpsave) ;
	/* Loop over all terminating separators (=commas) */
	tok = inpsave ;

#ifdef WILEYENDNOTEFILT_RIS
	trim(tok) ;
	revStr1(tok,tag) ;
#else


	tok = strtok_r(inpsave,",",&tokreent) ;
	while(tok)
	{

		/* skip leading white space in author names
		while(*tok == ' ')
			tok++ ;
		*/

		/* pass the "firstinit secndinit lastname" to the subroutine
		* without the comma separator.
		*/
		revStr1(tok,tag) ;
		tok = strtok_r(NULL,",",&tokreent) ;
	}
#endif
	free(inpsave) ;
}
%}

TAGA	"%A "
TAGARIS	"A1  - "
TAGE	"%E "
TAGERIS	"A2  - "
%%
{TAGA}.+\n { 

	/* EndNote author tag followed by at least one character up to the LF.
	* yytext+2 skips the two characters "%A", so revStr() receives the
	* blank after the tag, the list of authors with all commas, and the LF.
	*/
	revStr(yytext+2,"%A") ;
	}
{TAGARIS}.+\n { 

#ifdef WILEYENDNOTEFILT_RIS
	/* RIS author tag followed by at least one character up to the LF.
	* yytext+5 skips the five characters "A1  -", so revStr() receives the
	* blank after the tag, the name, and the LF.
	*/
	revStr(yytext+5,"A1  -") ;
#else
	/* RIS handling not compiled in: write the matched line as it is. */
	fprintf(yyout,"%s",yytext) ;
#endif
	}

{TAGE}.+\n { 

	/* Same as the "%A" rule, for the list of editors.
	*/
	revStr(yytext+2,"%E") ;
	}

{TAGERIS}.+\n { 

#ifdef WILEYENDNOTEFILT_RIS
	/* Same as the "A1  - " rule, for the editors.
	*/
	revStr(yytext+5,"A2  -") ;
#else
	/* RIS handling not compiled in: write the matched line as it is. */
	fprintf(yyout,"%s",yytext) ;
#endif
	}
%%

/* Run the filter.
* With a command line argument, that file is read; without one, the
* standard input is read. Output is whatever yylex() writes to yyout.
* @return 0 on success, EXIT_FAILURE if the input file cannot be opened.
*/
int main(int argc, char *argv[])
{
	if (argc > 1)
	{
		/* argv[0] is the program name; the first argument is the input file */
		yyin = fopen(argv[1], "r");
		if (yyin == NULL)
		{
			/* do not continue: the scanner would read stdin instead of the file */
			perror(argv[1]) ;
			return EXIT_FAILURE ;
		}
	}
	else
		yyin = stdin;

	yylex();

	if (yyin != NULL && yyin != stdin)
		fclose(yyin) ;
	return 0 ;
}