Merge pull request #2212 from TwlyY29/bibtex-parser

Added a bibtex parser that extracts identifiers of entries in bib-files
master
Matthew Brush 2019-10-29 17:12:02 -07:00 committed by GitHub
commit 8abe5342c5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 479 additions and 2 deletions

View File

@ -14,6 +14,7 @@ parsers = \
parsers/asciidoc.c \
parsers/asm.c \
parsers/basic.c \
parsers/bibtex.c \
parsers/c.c \
parsers/cobol.c \
parsers/iniconf.c \

View File

@ -65,6 +65,7 @@
GoParser, \
JsonParser, \
ZephirParser, \
PowerShellParser
PowerShellParser, \
BibtexParser
#endif /* CTAGS_MAIN_PARSERS_H */

431
ctags/parsers/bibtex.c Normal file
View File

@ -0,0 +1,431 @@
/*
* Copyright (c) 2000-2001, Jérôme Plût
* Copyright (c) 2006, Enrico Tröger
* Copyright (c) 2019, Mirco Schönfeld
*
* This source code is released for free distribution under the terms of the
* GNU General Public License.
*
* This module contains functions for generating tags for source files
for the BibTeX formatting system.
* https://en.wikipedia.org/wiki/BibTeX
*/
/*
* INCLUDE FILES
*/
#include "general.h" /* must always come first */
#include <ctype.h> /* to define isalpha () */
#include <string.h>
#include "debug.h"
#include "entry.h"
#include "keyword.h"
#include "parse.h"
#include "read.h"
#include "routines.h"
#include "vstring.h"
/*
 * MACROS
 */
/* Convenience predicates on a tokenInfo. */
#define isType(token,t) (bool) ((token)->type == (t))
#define isKeyword(token,k) (bool) ((token)->keyword == (k))
/* Characters that may appear in a BibTeX entry type or citation key. */
#define isIdentChar(c) \
	(isalpha (c) || isdigit (c) || (c) == '_' || (c) == '-' || (c) == '+')
/*
 * DATA DECLARATIONS
 */
/*
 * Used to specify type of keyword.
 * One value per recognized BibTeX entry type (the word after '@').
 */
enum eKeywordId {
	KEYWORD_article,
	KEYWORD_book,
	KEYWORD_booklet,
	KEYWORD_conference,
	KEYWORD_inbook,
	KEYWORD_incollection,
	KEYWORD_inproceedings,
	KEYWORD_manual,
	KEYWORD_mastersthesis,
	KEYWORD_misc,
	KEYWORD_phdthesis,
	KEYWORD_proceedings,
	KEYWORD_string,
	KEYWORD_techreport,
	KEYWORD_unpublished
};
typedef int keywordId; /* to allow KEYWORD_NONE */
enum eTokenType {
	/* 0..255 are the byte's value. Some are named for convenience */
	TOKEN_OPEN_CURLY = '{',
	/* above is special types */
	TOKEN_UNDEFINED = 256,
	TOKEN_KEYWORD,
	TOKEN_IDENTIFIER
};
typedef int tokenType;
/* A single scanned token plus the input position where it started. */
typedef struct sTokenInfo {
	tokenType type;        /* TOKEN_* or the literal byte value */
	keywordId keyword;     /* KEYWORD_* when type == TOKEN_KEYWORD */
	vString * string;      /* token text (includes leading '@' for entries) */
	unsigned long lineNumber;
	MIOPos filePosition;
} tokenInfo;
/*
 * DATA DEFINITIONS
 */
/* Language identifier assigned by ctags at initialization time. */
static langType Lang_bib;
/* Tag kinds emitted by this parser; one per BibTeX entry type.
 * NOTE: must be declared in the same order as eKeywordId above. */
typedef enum {
	BIBTAG_ARTICLE,
	BIBTAG_BOOK,
	BIBTAG_BOOKLET,
	BIBTAG_CONFERENCE,
	BIBTAG_INBOOK,
	BIBTAG_INCOLLECTION,
	BIBTAG_INPROCEEDINGS,
	BIBTAG_MANUAL,
	BIBTAG_MASTERSTHESIS,
	BIBTAG_MISC,
	BIBTAG_PHDTHESIS,
	BIBTAG_PROCEEDINGS,
	BIBTAG_STRING,
	BIBTAG_TECHREPORT,
	BIBTAG_UNPUBLISHED,
	BIBTAG_COUNT
} bibKind;
/* Kind table indexed by bibKind; the single-letter codes are the public
 * kind letters consumers (e.g. Geany's tag manager) map on. */
static kindDefinition BibKinds [] = {
	{ true, 'a', "article", "article" },
	{ true, 'b', "book", "book" },
	{ true, 'B', "booklet", "booklet" },
	{ true, 'c', "conference", "conference" },
	{ true, 'i', "inbook", "inbook" },
	{ true, 'I', "incollection", "incollection" },
	{ true, 'j', "inproceedings", "inproceedings" },
	{ true, 'm', "manual", "manual" },
	{ true, 'M', "mastersthesis", "mastersthesis" },
	{ true, 'n', "misc", "misc" },
	{ true, 'p', "phdthesis", "phdthesis" },
	{ true, 'P', "proceedings", "proceedings" },
	{ true, 's', "string", "string" },
	{ true, 't', "techreport", "techreport" },
	{ true, 'u', "unpublished", "unpublished" }
};
/* Entry-type words recognized after '@' (looked up case-insensitively). */
static const keywordTable BibKeywordTable [] = {
	/* keyword          keyword ID */
	{ "article",        KEYWORD_article },
	{ "book",           KEYWORD_book },
	{ "booklet",        KEYWORD_booklet },
	{ "conference",     KEYWORD_conference },
	{ "inbook",         KEYWORD_inbook },
	{ "incollection",   KEYWORD_incollection },
	{ "inproceedings",  KEYWORD_inproceedings },
	{ "manual",         KEYWORD_manual },
	{ "mastersthesis",  KEYWORD_mastersthesis },
	{ "misc",           KEYWORD_misc },
	{ "phdthesis",      KEYWORD_phdthesis },
	{ "proceedings",    KEYWORD_proceedings },
	{ "string",         KEYWORD_string },
	{ "techreport",     KEYWORD_techreport },
	{ "unpublished",    KEYWORD_unpublished }
};
/*
* FUNCTION DEFINITIONS
*/
/* Allocate a fresh, empty token positioned at the current input location.
 * Ownership passes to the caller, who must release it with deleteToken(). */
static tokenInfo *newToken (void)
{
	tokenInfo *const result = xMalloc (1, tokenInfo);

	result->type = TOKEN_UNDEFINED;
	result->keyword = KEYWORD_NONE;
	result->string = vStringNew ();
	result->lineNumber = getInputLineNumber ();
	result->filePosition = getInputFilePosition ();
	return result;
}
/* Free a token created by newToken(), including its string buffer. */
static void deleteToken (tokenInfo *const tok)
{
	vStringDelete (tok->string);
	eFree (tok);
}
/*
* Tag generation functions
*/
/* Emit a tag entry of the given kind for TOKEN, unless that kind has
 * been disabled by the user. */
static void makeBibTag (tokenInfo *const token, bibKind kind)
{
	tagEntryInfo entry;

	if (! BibKinds [kind].enabled)
		return;

	initTagEntry (&entry, vStringValue (token->string), kind);
	entry.lineNumber = token->lineNumber;
	entry.filePosition = token->filePosition;
	makeTagEntry (&entry);
}
/*
* Parsing functions
*/
/*
* Read a C identifier beginning with "firstChar" and places it into
* "name".
*/
/* Append to STRING the identifier starting with FIRSTCHAR, reading
 * further characters from the input file.  The first non-identifier
 * character encountered is pushed back (unless it is EOF). */
static void parseIdentifier (vString *const string, const int firstChar)
{
	int c;

	Assert (isIdentChar (firstChar));
	vStringPut (string, firstChar);
	while ((c = getcFromInputFile ()) != EOF && isIdentChar (c))
		vStringPut (string, c);
	if (c != EOF)
		ungetcToInputFile (c);
}
/* Read the next token from the input file into TOKEN.
 * Whitespace is skipped and '%' line comments are discarded.
 * Returns false on end of file, true otherwise (TOKEN is only
 * meaningful when true is returned). */
static bool readToken (tokenInfo *const token)
{
	int c;

	token->type = TOKEN_UNDEFINED;
	token->keyword = KEYWORD_NONE;
	vStringClear (token->string);

getNextChar:
	do
	{
		c = getcFromInputFile ();
	}
	while (c == '\t' || c == ' ' || c == '\n');

	token->lineNumber = getInputLineNumber ();
	token->filePosition = getInputFilePosition ();
	/* By default a single character is its own token type (e.g. '{'). */
	token->type = (unsigned char) c;

	switch (c)
	{
		case EOF: return false;

		case '@':
			/*
			 * All Bib entries start with an at symbol.
			 * Check if the next character is an alpha character
			 * else it is not a potential tex tag.
			 */
			c = getcFromInputFile ();
			if (! isalpha (c))
				ungetcToInputFile (c);
			else
			{
				vStringPut (token->string, '@');
				parseIdentifier (token->string, c);
				/* Look up the word without the leading '@'. */
				token->keyword = lookupCaseKeyword (vStringValue (token->string) + 1, Lang_bib);
				if (isKeyword (token, KEYWORD_NONE))
					token->type = TOKEN_IDENTIFIER;
				else
					token->type = TOKEN_KEYWORD;
			}
			break;

		case '%':
			/* % are single line comments; skip and rescan.
			 * (The unreachable break that followed this goto has
			 * been removed.) */
			skipToCharacterInInputFile ('\n');
			goto getNextChar;

		default:
			if (isIdentChar (c))
			{
				parseIdentifier (token->string, c);
				token->type = TOKEN_IDENTIFIER;
			}
			break;
	}
	return true;
}
/* Duplicate SRC into DEST (deep-copies the string buffer). */
static void copyToken (tokenInfo *const dest, tokenInfo *const src)
{
	vStringCopy (dest->string, src->string);
	dest->type = src->type;
	dest->keyword = src->keyword;
	dest->lineNumber = src->lineNumber;
	dest->filePosition = src->filePosition;
}
/*
* Scanning functions
*/
/* Parse one BibTeX entry whose '@keyword' token is in TOKEN and emit a
 * tag of KIND named after the citation key.
 *
 * Bib entries are of this format:
 *   @article{identifier,
 *     author="John Doe"}
 *
 * Returns true only when end of file was hit while reading, so the
 * caller can stop its scan loop.  A malformed entry (no identifier
 * after the opening brace) is skipped without aborting the rest of
 * the file — previously it was misreported as end-of-file, silently
 * dropping every entry that followed it. */
static bool parseTag (tokenInfo *const token, bibKind kind)
{
	tokenInfo * const name = newToken ();
	vString * currentid = vStringNew ();
	bool eof = false;

	if (isType (token, TOKEN_KEYWORD))
	{
		/* Remember the '@keyword' token's position for the tag. */
		copyToken (name, token);
		if (!readToken (token))
		{
			eof = true;
			goto out;
		}
	}

	if (isType (token, TOKEN_OPEN_CURLY))
	{
		if (!readToken (token))
		{
			eof = true;
			goto out;
		}
		if (isType (token, TOKEN_IDENTIFIER))
		{
			vStringCat (currentid, token->string);
			vStringStripTrailing (currentid);
			if (vStringLength (currentid) > 0)
			{
				vStringCopy (name->string, currentid);
				makeBibTag (name, kind);
			}
		}
		/* else: no identifier where one was expected; skip this
		 * entry but keep scanning the remainder of the file. */
	}
out:
	deleteToken (name);
	vStringDelete (currentid);
	return eof;
}
/* Scan the whole input file, dispatching every recognized '@keyword'
 * token to parseTag() with the matching tag kind.
 *
 * eKeywordId and bibKind are declared in this file in the same order
 * (article .. unpublished), so the keyword maps onto its kind by a
 * constant offset instead of a 15-arm switch. */
static void parseBibFile (tokenInfo *const token)
{
	bool eof = false;

	while (!eof)
	{
		if (!readToken (token))
			break;

		if (isType (token, TOKEN_KEYWORD)
			&& token->keyword >= KEYWORD_article
			&& token->keyword <= KEYWORD_unpublished)
		{
			const bibKind kind = (bibKind)
				(BIBTAG_ARTICLE + (token->keyword - KEYWORD_article));
			eof = parseTag (token, kind);
		}
	}
}
/* Parser initialization hook: remember the langType ctags assigned to
 * BibTeX, needed later for keyword lookups in readToken(). */
static void initialize (const langType language)
{
	Lang_bib = language;
}
/* Parser entry point: scan the current input file for BibTeX tags. */
static void findBibTags (void)
{
	tokenInfo *const tok = newToken ();

	parseBibFile (tok);
	deleteToken (tok);
}
/* Create parser definition structure */
extern parserDefinition* BibtexParser (void)
{
Assert (ARRAY_SIZE (BibKinds) == BIBTAG_COUNT);
static const char *const extensions [] = { "bib", NULL };
parserDefinition *const def = parserNew ("BibTeX");
def->extensions = extensions;
/*
* New definitions for parsing instead of regex
*/
def->kindTable = BibKinds;
def->kindCount = ARRAY_SIZE (BibKinds);
def->parser = findBibTags;
def->initialize = initialize;
def->keywordTable = BibKeywordTable;
def->keywordCount = ARRAY_SIZE (BibKeywordTable);
return def;
}

View File

@ -11,6 +11,7 @@ filetypes = \
filedefs/filetypes.asciidoc \
filedefs/filetypes.asm \
filedefs/filetypes.batch \
filedefs/filetypes.bibtex \
filedefs/filetypes.c \
filedefs/filetypes.caml \
filedefs/filetypes.Clojure.conf \

View File

@ -0,0 +1,7 @@
# For complete documentation of this file, please see Geany's main documentation
[settings]
# highlights commented lines
lexer_filetype=LaTeX
# default extension used when saving files
extension=bib

View File

@ -10,6 +10,7 @@ Arduino=*.ino;*.pde;
Asciidoc=*.asciidoc;*.adoc;
ASM=*.asm;*.asm51;*.a51;*.s;*.S;*.sx;
Batch=*.bat;*.cmd;*.nt;
BibTeX=*.bib;
CAML=*.ml;*.mli;
C=*.c;*.xpm;
C++=*.cpp;*.cxx;*.c++;*.cc;*.h;*.hpp;*.hxx;*.h++;*.hh;*.C;*.H;
@ -43,7 +44,7 @@ Java=*.java;*.jsp;
Javascript=*.js;
JSON=*.json;
Kotlin=*.kt;*.kts;
LaTeX=*.tex;*.sty;*.idx;*.ltx;*.latex;*.aux;*.bib;
LaTeX=*.tex;*.sty;*.idx;*.ltx;*.latex;*.aux;
Lisp=*.lisp;
Lua=*.lua;
Make=*.mak;*.mk;GNUmakefile;makefile;Makefile;makefile.*;Makefile.*;

View File

@ -160,6 +160,7 @@ static void init_builtin_filetypes(void)
FT_INIT( SQL, SQL, "SQL", NULL, FILE, MISC );
FT_INIT( COBOL, COBOL, "COBOL", NULL, SOURCE_FILE, COMPILED );
FT_INIT( LATEX, LATEX, "LaTeX", NULL, SOURCE_FILE, MARKUP );
FT_INIT( BIBTEX, BIBTEX, "BibTeX", NULL, SOURCE_FILE, MARKUP );
FT_INIT( VHDL, VHDL, "VHDL", NULL, SOURCE_FILE, COMPILED );
FT_INIT( VERILOG, VERILOG, "Verilog", NULL, SOURCE_FILE, COMPILED );
FT_INIT( DIFF, DIFF, "Diff", NULL, FILE, MISC );

View File

@ -105,6 +105,7 @@ typedef enum
GEANY_FILETYPES_COFFEESCRIPT,
GEANY_FILETYPES_GO,
GEANY_FILETYPES_ZEPHIR,
GEANY_FILETYPES_BIBTEX,
/* ^ append items here */
GEANY_MAX_BUILT_IN_FILETYPES /* Don't use this, use filetypes_array->len instead */
}

View File

@ -525,6 +525,20 @@ static void add_top_level_items(GeanyDocument *doc)
NULL);
break;
}
case GEANY_FILETYPES_BIBTEX:
{
tag_list_add_groups(tag_store,
&(tv_iters.tag_function), _("Articles"), ICON_NONE,
&(tv_iters.tag_macro), _("Book Chapters"), ICON_NONE,
&(tv_iters.tag_class), _("Books & Conference Proceedings"), ICON_NONE,
&(tv_iters.tag_member), _("Conference Papers"), ICON_NONE,
&(tv_iters.tag_variable), _("Theses"), ICON_NONE,
&(tv_iters.tag_namespace), _("Strings"), ICON_NONE,
&(tv_iters.tag_externvar), _("Unpublished"), ICON_NONE,
&(tv_iters.tag_other), _("Other"), ICON_NONE,
NULL);
break;
}
case GEANY_FILETYPES_MATLAB:
{
tag_list_add_groups(tag_store,

View File

@ -124,6 +124,23 @@ static TMParserMapEntry map_LATEX[] = {
{'n', tm_tag_namespace_t},
{'s', tm_tag_struct_t},
};
/* Map the BibTeX parser's kind letters (see BibKinds in parsers/bibtex.c)
 * onto tag-manager tag types, grouping related entry kinds under the
 * same symbol-tree category. */
static TMParserMapEntry map_BIBTEX[] = {
	{'a', tm_tag_function_t},   /* article */
	{'b', tm_tag_class_t},      /* book */
	{'B', tm_tag_class_t},      /* booklet */
	{'c', tm_tag_member_t},     /* conference */
	{'i', tm_tag_macro_t},      /* inbook */
	{'I', tm_tag_macro_t},      /* incollection */
	{'j', tm_tag_member_t},     /* inproceedings */
	{'m', tm_tag_other_t},      /* manual */
	{'M', tm_tag_variable_t},   /* mastersthesis */
	{'n', tm_tag_other_t},      /* misc */
	{'p', tm_tag_variable_t},   /* phdthesis */
	{'P', tm_tag_class_t},      /* proceedings */
	{'s', tm_tag_namespace_t},  /* string */
	{'t', tm_tag_other_t},      /* techreport */
	{'u', tm_tag_externvar_t},  /* unpublished */
};
static TMParserMapEntry map_ASM[] = {
{'d', tm_tag_macro_t},
@ -531,6 +548,7 @@ static TMParserMap parser_map[] = {
MAP_ENTRY(PHP),
MAP_ENTRY(PYTHON),
MAP_ENTRY(LATEX),
MAP_ENTRY(BIBTEX),
MAP_ENTRY(ASM),
MAP_ENTRY(CONF),
MAP_ENTRY(SQL),

View File

@ -109,6 +109,7 @@ enum
TM_PARSER_JSON,
TM_PARSER_ZEPHIR,
TM_PARSER_POWERSHELL,
TM_PARSER_BIBTEX,
TM_PARSER_COUNT
};