Merge pull request #2212 from TwlyY29/bibtex-parser

Added a bibtex parser that extracts identifiers of entries in bib-files
master
Matthew Brush 2019-10-29 17:12:02 -07:00 committed by GitHub
commit 8abe5342c5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 479 additions and 2 deletions

View File

@ -14,6 +14,7 @@ parsers = \
parsers/asciidoc.c \
parsers/asm.c \
parsers/basic.c \
parsers/bibtex.c \
parsers/c.c \
parsers/cobol.c \
parsers/iniconf.c \

View File

@ -65,6 +65,7 @@
GoParser, \
JsonParser, \
ZephirParser, \
PowerShellParser
PowerShellParser, \
BibtexParser
#endif /* CTAGS_MAIN_PARSERS_H */

431
ctags/parsers/bibtex.c Normal file
View File

@ -0,0 +1,431 @@
/*
* Copyright (c) 2000-2001, Jérôme Plût
* Copyright (c) 2006, Enrico Tröger
* Copyright (c) 2019, Mirco Schönfeld
*
* This source code is released for free distribution under the terms of the
* GNU General Public License.
*
* This module contains functions for generating tags for source files
for the BibTeX formatting system.
* https://en.wikipedia.org/wiki/BibTeX
*/
/*
* INCLUDE FILES
*/
#include "general.h" /* must always come first */
#include <ctype.h> /* to define isalpha () */
#include <string.h>
#include "debug.h"
#include "entry.h"
#include "keyword.h"
#include "parse.h"
#include "read.h"
#include "routines.h"
#include "vstring.h"
/*
 * MACROS
 */
/* Convenience predicates on a tokenInfo. */
#define isType(token,t) (bool) ((token)->type == (t))
#define isKeyword(token,k) (bool) ((token)->keyword == (k))
/* Characters that may appear in a BibTeX entry type or citation key. */
#define isIdentChar(c) \
	(isalpha (c) || isdigit (c) || (c) == '_' || (c) == '-' || (c) == '+')
/*
 * DATA DECLARATIONS
 */
/*
 * Used to specify type of keyword.
 * One value per recognized BibTeX entry type (the word after '@').
 */
enum eKeywordId {
	KEYWORD_article,
	KEYWORD_book,
	KEYWORD_booklet,
	KEYWORD_conference,
	KEYWORD_inbook,
	KEYWORD_incollection,
	KEYWORD_inproceedings,
	KEYWORD_manual,
	KEYWORD_mastersthesis,
	KEYWORD_misc,
	KEYWORD_phdthesis,
	KEYWORD_proceedings,
	KEYWORD_string,
	KEYWORD_techreport,
	KEYWORD_unpublished
};
typedef int keywordId; /* to allow KEYWORD_NONE */
enum eTokenType {
	/* 0..255 are the byte's value. Some are named for convenience */
	TOKEN_OPEN_CURLY = '{',
	/* above is special types */
	TOKEN_UNDEFINED = 256,
	TOKEN_KEYWORD,
	TOKEN_IDENTIFIER
};
typedef int tokenType;
/* A single scanned token plus the input position where it started. */
typedef struct sTokenInfo {
	tokenType type;        /* TOKEN_* or the literal byte value */
	keywordId keyword;     /* KEYWORD_* when type == TOKEN_KEYWORD */
	vString * string;      /* token text (includes leading '@' for entries) */
	unsigned long lineNumber;
	MIOPos filePosition;
} tokenInfo;
/*
 * DATA DEFINITIONS
 */
/* Language identifier assigned by ctags at initialization time. */
static langType Lang_bib;
/* Tag kinds emitted by this parser; one per BibTeX entry type.
 * NOTE: must be declared in the same order as eKeywordId above. */
typedef enum {
	BIBTAG_ARTICLE,
	BIBTAG_BOOK,
	BIBTAG_BOOKLET,
	BIBTAG_CONFERENCE,
	BIBTAG_INBOOK,
	BIBTAG_INCOLLECTION,
	BIBTAG_INPROCEEDINGS,
	BIBTAG_MANUAL,
	BIBTAG_MASTERSTHESIS,
	BIBTAG_MISC,
	BIBTAG_PHDTHESIS,
	BIBTAG_PROCEEDINGS,
	BIBTAG_STRING,
	BIBTAG_TECHREPORT,
	BIBTAG_UNPUBLISHED,
	BIBTAG_COUNT
} bibKind;
/* Kind table indexed by bibKind; the single-letter codes are the public
 * kind letters consumers (e.g. Geany's tag manager) map on. */
static kindDefinition BibKinds [] = {
	{ true, 'a', "article", "article" },
	{ true, 'b', "book", "book" },
	{ true, 'B', "booklet", "booklet" },
	{ true, 'c', "conference", "conference" },
	{ true, 'i', "inbook", "inbook" },
	{ true, 'I', "incollection", "incollection" },
	{ true, 'j', "inproceedings", "inproceedings" },
	{ true, 'm', "manual", "manual" },
	{ true, 'M', "mastersthesis", "mastersthesis" },
	{ true, 'n', "misc", "misc" },
	{ true, 'p', "phdthesis", "phdthesis" },
	{ true, 'P', "proceedings", "proceedings" },
	{ true, 's', "string", "string" },
	{ true, 't', "techreport", "techreport" },
	{ true, 'u', "unpublished", "unpublished" }
};
/* Entry-type words recognized after '@' (looked up case-insensitively). */
static const keywordTable BibKeywordTable [] = {
	/* keyword          keyword ID */
	{ "article",        KEYWORD_article },
	{ "book",           KEYWORD_book },
	{ "booklet",        KEYWORD_booklet },
	{ "conference",     KEYWORD_conference },
	{ "inbook",         KEYWORD_inbook },
	{ "incollection",   KEYWORD_incollection },
	{ "inproceedings",  KEYWORD_inproceedings },
	{ "manual",         KEYWORD_manual },
	{ "mastersthesis",  KEYWORD_mastersthesis },
	{ "misc",           KEYWORD_misc },
	{ "phdthesis",      KEYWORD_phdthesis },
	{ "proceedings",    KEYWORD_proceedings },
	{ "string",         KEYWORD_string },
	{ "techreport",     KEYWORD_techreport },
	{ "unpublished",    KEYWORD_unpublished }
};
/*
* FUNCTION DEFINITIONS
*/
/* Allocate a fresh, empty token positioned at the current input location.
 * Ownership passes to the caller, who must release it with deleteToken(). */
static tokenInfo *newToken (void)
{
	tokenInfo *const result = xMalloc (1, tokenInfo);

	result->type = TOKEN_UNDEFINED;
	result->keyword = KEYWORD_NONE;
	result->string = vStringNew ();
	result->lineNumber = getInputLineNumber ();
	result->filePosition = getInputFilePosition ();
	return result;
}
/* Free a token created by newToken(), including its string buffer. */
static void deleteToken (tokenInfo *const tok)
{
	vStringDelete (tok->string);
	eFree (tok);
}
/*
* Tag generation functions
*/
/* Emit a tag entry of the given kind for TOKEN, unless that kind has
 * been disabled by the user. */
static void makeBibTag (tokenInfo *const token, bibKind kind)
{
	tagEntryInfo entry;

	if (! BibKinds [kind].enabled)
		return;

	initTagEntry (&entry, vStringValue (token->string), kind);
	entry.lineNumber = token->lineNumber;
	entry.filePosition = token->filePosition;
	makeTagEntry (&entry);
}
/*
* Parsing functions
*/
/*
* Read a C identifier beginning with "firstChar" and places it into
* "name".
*/
/* Append to STRING the identifier starting with FIRSTCHAR, reading
 * further characters from the input file.  The first non-identifier
 * character encountered is pushed back (unless it is EOF). */
static void parseIdentifier (vString *const string, const int firstChar)
{
	int c;

	Assert (isIdentChar (firstChar));
	vStringPut (string, firstChar);
	while ((c = getcFromInputFile ()) != EOF && isIdentChar (c))
		vStringPut (string, c);
	if (c != EOF)
		ungetcToInputFile (c);
}
/* Read the next token from the input file into TOKEN.
 * Whitespace is skipped and '%' line comments are discarded.
 * Returns false on end of file, true otherwise (TOKEN is only
 * meaningful when true is returned). */
static bool readToken (tokenInfo *const token)
{
	int c;

	token->type = TOKEN_UNDEFINED;
	token->keyword = KEYWORD_NONE;
	vStringClear (token->string);

getNextChar:
	do
	{
		c = getcFromInputFile ();
	}
	while (c == '\t' || c == ' ' || c == '\n');

	token->lineNumber = getInputLineNumber ();
	token->filePosition = getInputFilePosition ();
	/* By default a single character is its own token type (e.g. '{'). */
	token->type = (unsigned char) c;

	switch (c)
	{
		case EOF: return false;

		case '@':
			/*
			 * All Bib entries start with an at symbol.
			 * Check if the next character is an alpha character
			 * else it is not a potential tex tag.
			 */
			c = getcFromInputFile ();
			if (! isalpha (c))
				ungetcToInputFile (c);
			else
			{
				vStringPut (token->string, '@');
				parseIdentifier (token->string, c);
				/* Look up the word without the leading '@'. */
				token->keyword = lookupCaseKeyword (vStringValue (token->string) + 1, Lang_bib);
				if (isKeyword (token, KEYWORD_NONE))
					token->type = TOKEN_IDENTIFIER;
				else
					token->type = TOKEN_KEYWORD;
			}
			break;

		case '%':
			/* % are single line comments; skip and rescan.
			 * (The unreachable break that followed this goto has
			 * been removed.) */
			skipToCharacterInInputFile ('\n');
			goto getNextChar;

		default:
			if (isIdentChar (c))
			{
				parseIdentifier (token->string, c);
				token->type = TOKEN_IDENTIFIER;
			}
			break;
	}
	return true;
}
/* Duplicate SRC into DEST (deep-copies the string buffer). */
static void copyToken (tokenInfo *const dest, tokenInfo *const src)
{
	vStringCopy (dest->string, src->string);
	dest->type = src->type;
	dest->keyword = src->keyword;
	dest->lineNumber = src->lineNumber;
	dest->filePosition = src->filePosition;
}
/*
* Scanning functions
*/
/* Parse one BibTeX entry whose '@keyword' token is in TOKEN and emit a
 * tag of KIND named after the citation key.
 *
 * Bib entries are of this format:
 *   @article{identifier,
 *     author="John Doe"}
 *
 * Returns true only when end of file was hit while reading, so the
 * caller can stop its scan loop.  A malformed entry (no identifier
 * after the opening brace) is skipped without aborting the rest of
 * the file — previously it was misreported as end-of-file, silently
 * dropping every entry that followed it. */
static bool parseTag (tokenInfo *const token, bibKind kind)
{
	tokenInfo * const name = newToken ();
	vString * currentid = vStringNew ();
	bool eof = false;

	if (isType (token, TOKEN_KEYWORD))
	{
		/* Remember the '@keyword' token's position for the tag. */
		copyToken (name, token);
		if (!readToken (token))
		{
			eof = true;
			goto out;
		}
	}

	if (isType (token, TOKEN_OPEN_CURLY))
	{
		if (!readToken (token))
		{
			eof = true;
			goto out;
		}
		if (isType (token, TOKEN_IDENTIFIER))
		{
			vStringCat (currentid, token->string);
			vStringStripTrailing (currentid);
			if (vStringLength (currentid) > 0)
			{
				vStringCopy (name->string, currentid);
				makeBibTag (name, kind);
			}
		}
		/* else: no identifier where one was expected; skip this
		 * entry but keep scanning the remainder of the file. */
	}
out:
	deleteToken (name);
	vStringDelete (currentid);
	return eof;
}
/* Scan the whole input file, dispatching every recognized '@keyword'
 * token to parseTag() with the matching tag kind.
 *
 * eKeywordId and bibKind are declared in this file in the same order
 * (article .. unpublished), so the keyword maps onto its kind by a
 * constant offset instead of a 15-arm switch. */
static void parseBibFile (tokenInfo *const token)
{
	bool eof = false;

	while (!eof)
	{
		if (!readToken (token))
			break;

		if (isType (token, TOKEN_KEYWORD)
			&& token->keyword >= KEYWORD_article
			&& token->keyword <= KEYWORD_unpublished)
		{
			const bibKind kind = (bibKind)
				(BIBTAG_ARTICLE + (token->keyword - KEYWORD_article));
			eof = parseTag (token, kind);
		}
	}
}
/* Parser initialization hook: remember the langType ctags assigned to
 * BibTeX, needed later for keyword lookups in readToken(). */
static void initialize (const langType language)
{
	Lang_bib = language;
}
/* Parser entry point: scan the current input file for BibTeX tags. */
static void findBibTags (void)
{
	tokenInfo *const tok = newToken ();

	parseBibFile (tok);
	deleteToken (tok);
}
/* Create parser definition structure */
extern parserDefinition* BibtexParser (void)
{
Assert (ARRAY_SIZE (BibKinds) == BIBTAG_COUNT);
static const char *const extensions [] = { "bib", NULL };
parserDefinition *const def = parserNew ("BibTeX");
def->extensions = extensions;
/*
* New definitions for parsing instead of regex
*/
def->kindTable = BibKinds;
def->kindCount = ARRAY_SIZE (BibKinds);
def->parser = findBibTags;
def->initialize = initialize;
def->keywordTable = BibKeywordTable;
def->keywordCount = ARRAY_SIZE (BibKeywordTable);
return def;
}

View File

@ -11,6 +11,7 @@ filetypes = \
filedefs/filetypes.asciidoc \
filedefs/filetypes.asm \
filedefs/filetypes.batch \
filedefs/filetypes.bibtex \
filedefs/filetypes.c \
filedefs/filetypes.caml \
filedefs/filetypes.Clojure.conf \

View File

@ -0,0 +1,7 @@
# For complete documentation of this file, please see Geany's main documentation
[settings]
# highlights commented lines
lexer_filetype=LaTeX
# default extension used when saving files
extension=bib

View File

@ -10,6 +10,7 @@ Arduino=*.ino;*.pde;
Asciidoc=*.asciidoc;*.adoc;
ASM=*.asm;*.asm51;*.a51;*.s;*.S;*.sx;
Batch=*.bat;*.cmd;*.nt;
BibTeX=*.bib;
CAML=*.ml;*.mli;
C=*.c;*.xpm;
C++=*.cpp;*.cxx;*.c++;*.cc;*.h;*.hpp;*.hxx;*.h++;*.hh;*.C;*.H;
@ -43,7 +44,7 @@ Java=*.java;*.jsp;
Javascript=*.js;
JSON=*.json;
Kotlin=*.kt;*.kts;
LaTeX=*.tex;*.sty;*.idx;*.ltx;*.latex;*.aux;*.bib;
LaTeX=*.tex;*.sty;*.idx;*.ltx;*.latex;*.aux;
Lisp=*.lisp;
Lua=*.lua;
Make=*.mak;*.mk;GNUmakefile;makefile;Makefile;makefile.*;Makefile.*;

View File

@ -160,6 +160,7 @@ static void init_builtin_filetypes(void)
FT_INIT( SQL, SQL, "SQL", NULL, FILE, MISC );
FT_INIT( COBOL, COBOL, "COBOL", NULL, SOURCE_FILE, COMPILED );
FT_INIT( LATEX, LATEX, "LaTeX", NULL, SOURCE_FILE, MARKUP );
FT_INIT( BIBTEX, BIBTEX, "BibTeX", NULL, SOURCE_FILE, MARKUP );
FT_INIT( VHDL, VHDL, "VHDL", NULL, SOURCE_FILE, COMPILED );
FT_INIT( VERILOG, VERILOG, "Verilog", NULL, SOURCE_FILE, COMPILED );
FT_INIT( DIFF, DIFF, "Diff", NULL, FILE, MISC );

View File

@ -105,6 +105,7 @@ typedef enum
GEANY_FILETYPES_COFFEESCRIPT,
GEANY_FILETYPES_GO,
GEANY_FILETYPES_ZEPHIR,
GEANY_FILETYPES_BIBTEX,
/* ^ append items here */
GEANY_MAX_BUILT_IN_FILETYPES /* Don't use this, use filetypes_array->len instead */
}

View File

@ -525,6 +525,20 @@ static void add_top_level_items(GeanyDocument *doc)
NULL);
break;
}
case GEANY_FILETYPES_BIBTEX:
{
tag_list_add_groups(tag_store,
&(tv_iters.tag_function), _("Articles"), ICON_NONE,
&(tv_iters.tag_macro), _("Book Chapters"), ICON_NONE,
&(tv_iters.tag_class), _("Books & Conference Proceedings"), ICON_NONE,
&(tv_iters.tag_member), _("Conference Papers"), ICON_NONE,
&(tv_iters.tag_variable), _("Theses"), ICON_NONE,
&(tv_iters.tag_namespace), _("Strings"), ICON_NONE,
&(tv_iters.tag_externvar), _("Unpublished"), ICON_NONE,
&(tv_iters.tag_other), _("Other"), ICON_NONE,
NULL);
break;
}
case GEANY_FILETYPES_MATLAB:
{
tag_list_add_groups(tag_store,

View File

@ -124,6 +124,23 @@ static TMParserMapEntry map_LATEX[] = {
{'n', tm_tag_namespace_t},
{'s', tm_tag_struct_t},
};
/* Map the BibTeX parser's kind letters (see BibKinds in parsers/bibtex.c)
 * onto tag-manager tag types, grouping related entry kinds under the
 * same symbol-tree category. */
static TMParserMapEntry map_BIBTEX[] = {
	{'a', tm_tag_function_t},   /* article */
	{'b', tm_tag_class_t},      /* book */
	{'B', tm_tag_class_t},      /* booklet */
	{'c', tm_tag_member_t},     /* conference */
	{'i', tm_tag_macro_t},      /* inbook */
	{'I', tm_tag_macro_t},      /* incollection */
	{'j', tm_tag_member_t},     /* inproceedings */
	{'m', tm_tag_other_t},      /* manual */
	{'M', tm_tag_variable_t},   /* mastersthesis */
	{'n', tm_tag_other_t},      /* misc */
	{'p', tm_tag_variable_t},   /* phdthesis */
	{'P', tm_tag_class_t},      /* proceedings */
	{'s', tm_tag_namespace_t},  /* string */
	{'t', tm_tag_other_t},      /* techreport */
	{'u', tm_tag_externvar_t},  /* unpublished */
};
static TMParserMapEntry map_ASM[] = {
{'d', tm_tag_macro_t},
@ -531,6 +548,7 @@ static TMParserMap parser_map[] = {
MAP_ENTRY(PHP),
MAP_ENTRY(PYTHON),
MAP_ENTRY(LATEX),
MAP_ENTRY(BIBTEX),
MAP_ENTRY(ASM),
MAP_ENTRY(CONF),
MAP_ENTRY(SQL),

View File

@ -109,6 +109,7 @@ enum
TM_PARSER_JSON,
TM_PARSER_ZEPHIR,
TM_PARSER_POWERSHELL,
TM_PARSER_BIBTEX,
TM_PARSER_COUNT
};