PLUGIN html lang: "C++" version: "1.0" date: "2022-03-20" author: "Julien BRUGUIER" maintainer: "Julien BRUGUIER " synopsis: "Plugin to manage HTML documents" description: %{ This plugin is an HTML document manager allowing operations on the DOM. %} comment flex: "/*" " * " " */" comment bison: "/*" " * " " */" includes: %{ #include #include #include #include #include %} code: %{ struct yy_buffer_state; void htmlparserlex_init(void **); void htmlparserlex_destroy(void *); yy_buffer_state* htmlparser_scan_buffer(char *, size_t, void*); void htmlparser_delete_buffer(yy_buffer_state *buffer, void *scanner); int htmlparserparse(void *scanner, Html::Dom &dom); %} file source: "src/html.h" %{ #pragma once #include #include #include #include #include #include namespace Html { template struct Compare { bool operator() (const std::weak_ptr& l, const std::weak_ptr& r) { return l.lock() { typedef std::shared_ptr SP; typedef std::weak_ptr WP; virtual ~Element() {} virtual Element::SP clone() const = 0; virtual Element::SP id(const std::string& id) { return Element::SP(); } virtual std::vector in_class(const std::string& class_name) { std::vector elements; return elements; } template friend oStream& operator<<(oStream& os, const Element::SP& e) { e->print(os); return os; } virtual void print(std::ostream& os) const = 0; std::set > _parents; bool verification(const Element::SP& child) const { if(child==this->shared_from_this()) { return false; } for(const auto& p:_parents) { auto pp = p.lock(); if(not pp) continue; if(not pp->verification(child)) return false; } return true; } }; struct Text : public Element { Text(const std::string& text) :_text(text) {} virtual Element::SP clone() const override { return std::make_shared(_text); } virtual void print(std::ostream& os) const override { os << _text; } std::string _text; }; struct Comment : public Element { Comment(const std::string& text) :_text(text) {} virtual Element::SP clone() const override { return std::make_shared(_text); 
} virtual void print(std::ostream& os) const override { os << ""; } std::string _text; }; struct Meta : public Element { Meta(const std::string& text) :_text(text) {} virtual Element::SP clone() const override { return std::make_shared(_text); } virtual void print(std::ostream& os) const override { os << ""; } std::string _text; }; struct Node : public Element { Node(const std::string& type) :_type(type) {} virtual Element::SP clone() const override { auto e = std::make_shared(_type); e->_id = _id; e->_class = _class; e->_attributes = _attributes; for(const auto& c: _children) { e->_children.push_back(c->clone()); } return e; } void id_and_class() { auto it = _attributes.find("id"); if(it!=_attributes.end()) { _id = it->second; } it = _attributes.find("class"); if(it!=_attributes.end()) { _class.clear(); auto c = it->second; for( ; ; ) { auto itt=c.find(" "); if(itt==std::string::npos) { _class.insert(c); break; } if(itt>0) { _class.insert(c.substr(0,itt)); } c = c.substr(itt+1); } } } virtual Element::SP id(const std::string& id) { if(_id==id) return this->shared_from_this(); for(auto& c:_children) { auto e = c->id(id); if(static_cast(e)) return e; } return Element::SP(); } virtual std::vector in_class(const std::string& class_name) { std::vector elements; auto it = _class.find(class_name); if(it!=_class.end()) { elements.push_back(this->shared_from_this()); } for(auto& c:_children) { auto e = c->in_class(class_name); elements.insert(elements.end(),e.begin(),e.end()); } return elements; } virtual void print(std::ostream& os) const override { os << "<" << _type; for(const auto& a:_attributes) { os << " " << a.first << "=\"" << a.second << "\""; } if(_children.empty()) { os << "/>"; return; } os << ">"; for(const auto& c:_children) { os << c; } os << ""; } std::string _type; std::string _id; std::set _class; std::map _attributes; std::vector _children; }; struct Dom { using SP = std::shared_ptr; Dom() = default; Dom(const Dom& dom) :_error(dom._error) { for(const 
auto& r:dom._root) { _root.push_back(r->clone()); } } std::vector _root; std::string _error; Element::SP id(const std::string& id) const { for(auto& r:_root) { auto e = r->id(id); if(static_cast(e)) return e; } return Element::SP(); } std::vector in_class(const std::string& class_name) const { std::vector elements; for(auto& r:_root) { auto e = r->in_class(class_name); elements.insert(elements.end(),e.begin(),e.end()); } return elements; } template friend oStream& operator<<(oStream& os, const Dom& d) { if(not d._error.empty()) { os << "Error: " << d._error; } else { for(auto& e:d._root) { os << e; } } return os; } }; } %} file source: "src/html.cpp" %{ #include %} file flex: "src/parser/parser.lex.lpp" ${ %{ #include #include #define YY_USER_ACTION yylloc->first_line = yylloc->last_line = yylineno; %} %option nounput %option reentrant %option bison-bridge %option bison-locations %option noyywrap %s S_TAG %s S_COMMENT %s S_META %% "<" { BEGIN(S_TAG); return INFERIOR; } ">" { BEGIN(INITIAL); return SUPERIOR; } [a-z][a-z0-9_]* { yylval->string = std::string(yytext,yyleng); return IDENTIFIER; } \"([^"\n]|\\\\\")*\" { std::string token(yytext,yyleng); yylval->string = token.substr(1,token.size()-2); return STRING_VALUE; } \"([^"\n]|\\\\\")*\n { return _INVALID_; } \/ { return SLASH; } = { return EQUAL; } : { return COLON; } [ \t] { } (\n|[^<])+ { yylval->string = std::string(yytext,yyleng); yylloc->last_line+=std::count(yylval->string.begin(),yylval->string.end(),'\n'); return TEXT; } "!--" { BEGIN(S_COMMENT); return COMMENT; } (\n|[^-]|-[^-]|--[^>])* { /* Store the comment text first: the newline count has to run on the token just scanned, not on the value left in yylval by the previous token. */ yylval->string = std::string(yytext,yyleng); yylloc->last_line+=std::count(yylval->string.begin(),yylval->string.end(),'\n'); return TEXT; } "-->" { BEGIN(INITIAL); return SUPERIOR; } \? 
{ BEGIN(S_META); return META; } (\n|[^\?]|\?[^>])* { /* Same ordering as the other text rules: assign the token, then count its newlines. */ yylval->string = std::string(yytext,yyleng); yylloc->last_line+=std::count(yylval->string.begin(),yylval->string.end(),'\n'); return TEXT; } "?>" { BEGIN(INITIAL); return SUPERIOR; } %% $} file bison: "src/parser/parser.syn.ypp" ${ %{ #include #define YYDEBUG 0 #include extern int htmlparsererror(YYLTYPE *llocp, void *scanner, Html::Dom& dom, std::string mesg); extern int htmlparserlex(YYSTYPE *lvalp, YYLTYPE *llocp, void *scanner); extern int htmlparserlex_init(void *scanner); extern int htmlparserlex_destroy(void *scanner); %} %locations %define api.pure full %define api.value.type { struct ParserValue } %param { void *scanner } %parse-param { Html::Dom& dom } %initial-action { #if YYDEBUG==1 htmlparserdebug=1; #endif } %code requires { struct ParserValue { std::string string; Html::Element::SP element; std::vector elements; std::map attributes; std::pair attribute; }; # define YYCOPY(Dst, Src, Count) \ do \ { \ YYSIZE_T yyi; \ for (yyi = 0; yyi < (Count); yyi++) \ (Dst)[yyi] = (Src)[yyi]; \ } \ while (0) } %token _INVALID_ %token IDENTIFIER STRING_VALUE TEXT %token INFERIOR SUPERIOR SLASH EQUAL COLON COMMENT META %type tag %type text comment meta element simple_node complex_node %type attribute %type element_list %type attribute_list %start dom %% dom: element_list { dom._root = $1; } ; element_list: { $$ = std::vector(); } | element_list element { $$=$1; $$.push_back($2); } ; element: text { $$=$1; } | comment { $$=$1; } | meta { $$=$1; } | simple_node { $$=$1; } | complex_node { $$=$1; } ; text: TEXT { $$ = std::make_shared($1); } ; comment: INFERIOR COMMENT TEXT SUPERIOR { $$ = std::make_shared($3); } ; meta: INFERIOR META TEXT SUPERIOR { $$ = std::make_shared($3); } ; simple_node: INFERIOR tag attribute_list SLASH SUPERIOR { auto node = std::make_shared($2); node->_attributes = $3; node->id_and_class(); $$=node; } ; complex_node: INFERIOR tag attribute_list SUPERIOR element_list INFERIOR SLASH 
tag SUPERIOR { if($2!=$8) { htmlparsererror(&@$,scanner,dom,"Mismatched node idendifier"); } auto node = std::make_shared($2); node->_attributes = $3; node->id_and_class(); node->_children = $5; $$=node; } ; tag: IDENTIFIER { $$ = $1; } | IDENTIFIER COLON IDENTIFIER { $$ = $1+":"+$3; } ; attribute_list: { $$ = std::map(); } | attribute_list attribute { $$=$1; $$.insert($2); } ; attribute: IDENTIFIER EQUAL STRING_VALUE { $$ = std::make_pair($1,$3); } | IDENTIFIER EQUAL IDENTIFIER { $$ = std::make_pair($1,$3); } ; %% int htmlparsererror(YYLTYPE *llocp, void *scanner, Html::Dom& dom, std::string mesg) { std::ostringstream oss; oss << "Invalid HTML text, line"; if(llocp->first_line==llocp->last_line) { oss << " " << llocp->first_line; } else { oss << "s " << llocp->first_line << "-" << llocp->last_line; } oss << ": " << mesg; dom._error = oss.str(); return 1; } $} file source: "src/parser/includes.h" %{ #include #include %} file make: "src/parser/Makefile.am" %{ AM_CPPFLAGS=-I${top_builddir} -I${top_srcdir} SUBDIRS= noinst_LTLIBRARIES=libparser.la BUILT_SOURCES=parser.lex.cpp parser.syn.cpp EXTRABUILTSOURCES=parser.syn.h CLEANFILES=parser.syn.output AM_YFLAGS=-d -v --file-prefix=y --name-prefix=htmlparser -o y.tab.c AM_LFLAGS=-P htmlparser -o lex.yy.c libparser_la_CXXFLAGS=$(AM_CXXFLAGS) -Wno-error=sign-compare libparser_la_SOURCES=parser.lex.lpp parser.syn.ypp includes.h libparser_la_LIBADD= libparser_la_LDFLAGS=-no-undefined -module %} patch: "configure.ac" %{ --- configure.ac.orig 2023-03-24 04:54:45.481596839 +0100 +++ configure.ac 2023-03-24 04:54:45.541597019 +0100 @@ -29,7 +29,10 @@ AC_CHECK_PROG([DIFF],[diff],[diff -u]) AC_PROG_CXX +AM_PROG_LEX +AC_PROG_YACC AC_LANG([C++]) +AM_CXXFLAGS="$(AM_CXXFLAGS) -std=c++14" AM_PROG_LIBTOOL AC_CHECK_TOOL([STRIP],[strip]) @@ -54,6 +57,7 @@ AC_CONFIG_FILES([ Makefile src/Makefile + src/parser/Makefile doc/Makefile test/Makefile ]) %} patch: "Makefile.am" %{ --- Makefile.am.orig 2023-03-24 04:54:45.535597001 +0100 +++ 
Makefile.am 2023-03-24 04:55:50.840792199 +0100 @@ -26,5 +26,5 @@ libsvmhtml_la_SOURCES= libsvmhtml_la_LDFLAGS=-module -ldl -Wl,-rpath -Wl,${pkglibdir} -Wl,-L$(SVM) -libsvmhtml_la_LIBADD=src/libplugin.la +libsvmhtml_la_LIBADD=src/libplugin.la src/parser/libparser.la libsvmhtml_la_LIBTOOLFLAGS=--tag=disable-static %} patch: "Makefile.local" %{ --- Makefile.local.orig 2023-03-24 04:54:45.535597001 +0100 +++ Makefile.local 2023-03-24 04:54:45.538597010 +0100 @@ -20,8 +20,8 @@ OPTIONS= -DEPENDENCIES= -GENERATED= +DEPENDENCIES=src/html.o src/parser/parser.syn.o src/parser/parser.lex.o +GENERATED=src/parser/parser.lex.cpp src/parser/parser.syn.{cpp,hpp,output} all: libsvmhtml.so @@ -31,6 +31,12 @@ .cpp.o: g++ -std=c++14 -o $@ -c $< -fPIC -DPIC $(OPTIONS) -I. +src/parser/parser.lex.cpp: src/parser/parser.lex.lpp + flex -P htmlparser -o $@ $< + +src/parser/parser.syn.cpp: src/parser/parser.syn.ypp + bison -d -v --file-prefix=y --name-prefix=htmlparser -o $@ $< + .PHONY: clean clean: rm -rf src/plugin.o $(DEPENDENCIES) $(GENERATED) libsvmhtml.so %} patch: "src/Makefile.am" %{ --- src/Makefile.am.orig 2023-03-24 04:54:45.497596887 +0100 +++ src/Makefile.am 2023-03-24 04:54:45.543597025 +0100 @@ -18,10 +18,10 @@ AM_CPPFLAGS=-I${top_builddir} -I${top_srcdir} -std=c++14 -SUBDIRS= +SUBDIRS=parser noinst_LTLIBRARIES=libplugin.la -libplugin_la_SOURCES=plugin.cpp plugin.h +libplugin_la_SOURCES=plugin.cpp plugin.h html.cpp html.h libplugin_la_LIBADD= libplugin_la_LDFLAGS=-no-undefined %} DEFINE TYPE html.dom %{ type_dom() :_dom(std::make_shared()) { } type_dom(const type_dom& dom) :_dom(std::make_shared(*(dom._dom))) { } Html::Dom::SP _dom; operator std::string () const { std::ostringstream oss; oss << (*_dom); return oss.str(); } %} delete default: %{} copy default: %{} constant: %{ void *scanner; ::htmlparserlex_init(&scanner); char *src = new char[value.size+2]; ::memcpy(src,value.string,value.size); src[value.size] = src[value.size+1] = '\0'; yy_buffer_state *buffer = 
::htmlparser_scan_buffer(src,value.size+2,scanner); type_dom *t = new type_dom; ::htmlparserparse(scanner,*(t->_dom)); ::htmlparser_delete_buffer(buffer,scanner); delete [] src; ::htmlparserlex_destroy(scanner); return t; %} print default: %{} help: "This type contains a representation of a full HTML document." TYPE html.element %{ type_element() = default; type_element(const type_element& te) { _element = te._element->clone(); } operator std::string () const { std::ostringstream oss; oss << _element; return oss.str(); } Html::Element::SP _element; %} delete default: %{} copy default: %{} print default: %{} help: "This type contains a representation of an HTML node or text." INSTRUCTION html.text STR -> html.element %{ auto text = ARGV_VALUE(0,string); type_element *t = new type_element; t->_element = std::make_shared(std::string(text.string,text.size)); return NEW_PLUGIN(html,element,t); %} help: "This instruction creates an HTML element containing a raw text." INSTRUCTION html.comment STR -> html.element %{ auto text = ARGV_VALUE(0,string); type_element *t = new type_element; t->_element = std::make_shared(std::string(text.string,text.size)); return NEW_PLUGIN(html,element,t); %} help: "This instruction creates an HTML element containing the text surrounded by ." INSTRUCTION html.meta STR -> html.element %{ auto text = ARGV_VALUE(0,string); type_element *t = new type_element; t->_element = std::make_shared(std::string(text.string,text.size)); return NEW_PLUGIN(html,element,t); %} help: "This instruction creates an HTML element containing the text surrounded by ." INSTRUCTION html.node STR : type ( , STR : attribute_key = STR : attribute_value ) * ( { html.element + } ) ? 
-> html.element %{ auto type = ARGV_VALUE(0,string); auto node = std::make_shared(std::string(type.string,type.size)); std::map attributes; size_t index = 1; for( ; index children; for( ++index ; index_element->_parents.insert(node); children.push_back(child->_element); } type_element *t = new type_element; node->_attributes = attributes; node->_children = children; node->id_and_class(); t->_element = node; return NEW_PLUGIN(html,element,t); %} help: %{ This instruction creates an HTML node with a type (the name between < and >), attributes (key="value") and optional content (between and ). %} INSTRUCTION html.insert MUTABLE html.element : parent INT : index 'END' ? html.element : child %{ auto raw_parent = ARGV_PLUGIN(0,html,element); auto parent = std::dynamic_pointer_cast(raw_parent->_element); if(not parent) { ERROR_INTERNAL(FAILURE,"Parent is not a node"); } auto index = ARGV_VALUE(1,integer); size_t position = 2; if(::svm_parameter_type_is_keyword(svm,argv[position])) { ++position; index += parent->_children.size(); } auto child = ARGV_PLUGIN(position,html,element); if((index>=0) and (index_children.size())) { parent->_children.insert(parent->_children.begin()+index,child->_element); } else if(index==parent->_children.size()) { if(not parent->verification(child->_element)) { ERROR_INTERNAL(FAILURE,"HTML element cycle detected"); } parent->_children.push_back(child->_element); child->_element->_parents.insert(parent); } else { ERROR_INTERNAL(FAILURE,"Out of range"); } %} help: %{ This instruction inserts into a node a child element. The child element is insert at the specified index. When the keyword END is present, the index 0 matches the element after the last element (Inserting at 0 END means adding at the end). If the parent element is not a node, or when the insertion index is outside the node children list, a FAILURE interruption is raised. .I Adding as child an element being the parent of the node will also raise a FAILURE interruption. 
%} INSTRUCTION html.replace MUTABLE html.element : parent INT : index 'END' ? html.element : child %{ auto raw_parent = ARGV_PLUGIN(0,html,element); auto parent = std::dynamic_pointer_cast(raw_parent->_element); if(not parent) { ERROR_INTERNAL(FAILURE,"Parent is not a node"); } auto index = ARGV_VALUE(1,integer); size_t position = 2; if(::svm_parameter_type_is_keyword(svm,argv[position])) { ++position; index += parent->_children.size(); } auto child = ARGV_PLUGIN(position,html,element); if((index>=0) and (index_children.size())) { if(not parent->verification(child->_element)) { ERROR_INTERNAL(FAILURE,"HTML element cycle detected"); } auto it = parent->_children[index]->_parents.find(parent); parent->_children[index]->_parents.erase(it); parent->_children[index] = child->_element; child->_element->_parents.insert(parent); } else { ERROR_INTERNAL(FAILURE,"Out of range"); } %} help: %{ This instruction replaces a child of a node by another element. The child element is replaced at the specified index. When the keyword END is present, the index 0 matches the element after the last element (Inserting at 0 END means adding at the end). If the parent element is not a node, or when the replacement index is outside the node children list, a FAILURE interruption is raised. .I Adding as child an element being the parent of the node will also raise a FAILURE interruption. %} INSTRUCTION html.remove MUTABLE html.element : parent ( INT : index 'END' ? 
| html.element : child )
%{
	// Only nodes own a children list: any other kind of element is rejected.
	// NOTE(review): ERROR_INTERNAL is relied on to leave the instruction, as the
	// unguarded use of parent right after the check has always assumed.
	auto raw_parent = ARGV_PLUGIN(0,html,element);
	auto parent = std::dynamic_pointer_cast<Html::Node>(raw_parent->_element);
	if(not parent)
	{
		ERROR_INTERNAL(FAILURE,"Parent is not a node");
	}
	auto& children = parent->_children;
	// Position of the child to remove; stays at end() when an element which is not a child was given.
	auto target = children.end();
	SVM_Value value = ::svm_parameter_value_get(svm,argv[1]);
	if(::svm_value_type_is_integer(svm,value))
	{
		auto index = ::svm_value_integer_get(svm,value);
		// Signed copy of the size, so that the range checks never mix signed and unsigned values.
		const auto count = static_cast<decltype(index)>(children.size());
		// END is optional: argv[2] exists only when the keyword was written.
		if((argc>2) and ::svm_parameter_type_is_keyword(svm,argv[2]))
		{
			index += count;
		}
		if((index<0) or (index>=count))
		{
			ERROR_INTERNAL(FAILURE,"Out of range");
		}
		target = children.begin()+index;
	}
	else
	{
		auto child = ARGV_PLUGIN(1,html,element);
		target = std::find(children.begin(),children.end(),child->_element);
	}
	if(target!=children.end())
	{
		// Keep the removed child alive while its back-reference is updated.
		auto removed = *target;
		children.erase(target);
		// The back-reference is dropped only when the element is no longer
		// present anywhere in the children list.
		if(std::find(children.begin(),children.end(),removed)==children.end())
		{
			// The parent may never have been registered on this child:
			// erasing the end iterator would be undefined behaviour.
			auto it = removed->_parents.find(parent);
			if(it!=removed->_parents.end())
			{
				removed->_parents.erase(it);
			}
		}
	}
%}
help:
%{
This instruction removes a child of a node.
The child is designated either by its index in the node children list, or by the element itself.
When the keyword END is present, the index is counted from the end of the children list (-1 END designates the last child).
If the parent element is not a node, or when the index is outside the node children list, a FAILURE interruption is raised.
When the child is given as an element which is not a child of the node, nothing is removed.
%}

INSTRUCTION html.dom html.element + -> html.dom
%{
	// The document references the given elements as its roots, in parameter order (no copy is made).
	type_dom *t = new type_dom;
	for(size_t index=0 ; index<argc ; ++index)
	{
		auto element = ARGV_PLUGIN(index,html,element);
		t->_dom->_root.push_back(element->_element);
	}
	return NEW_PLUGIN(html,dom,t);
%}
help:
%{
This instruction transforms a list of HTML elements into an HTML DOM.
%}

INSTRUCTION html.id html.dom STR : id -> html.element ?
%{
	// Searches the document roots, in order, for an element carrying the requested id.
	auto document = ARGV_PLUGIN(0,html,dom);
	auto wanted = ARGV_VALUE(1,string);
	const std::string key(wanted.string,wanted.size);
	auto found = document->_dom->id(key);
	if(found)
	{
		// The returned value references the element of the document: it is not cloned.
		type_element *wrapper = new type_element();
		wrapper->_element = found;
		return NEW_PLUGIN(html,element,wrapper);
	}
	// No element carries this id: answer with the null html.element value.
	return NEW_NULL_PLUGIN(html,element);
%}
help: %{ This instruction returns the first HTML element within the DOM having the "id" attribute set to the parameter. When not found, the null HTML element is returned. %}

INSTRUCTION html.class html.dom STR : class -> PTR
%{
	// Collects every element of the document whose class set contains the requested name.
	auto document = ARGV_PLUGIN(0,html,dom);
	auto wanted = ARGV_VALUE(1,string);
	const auto matches = document->_dom->in_class(std::string(wanted.string,wanted.size));
	// Describes, then allocates in the current kernel, one html.element cell per match.
	SVM_Memory_Zone layout = ::svm_memory_zone_new(svm);
	::svm_memory_zone_append_external__raw(svm,layout,CONST_PEP(html,element),matches.size());
	SVM_Value_Pointer block = ::svm_memory_allocate(svm,CURRENT(kernel),layout);
	// Writes the matches into consecutive addresses, starting at the address held by the pointer.
	SVM_Address cell = ::svm_value_pointer_get_address(svm,block);
	for(const auto& match:matches)
	{
		type_element *wrapper = new type_element();
		wrapper->_element = match;
		::svm_memory_write_address(svm,CURRENT(kernel),cell,NEW_PLUGIN(html,element,wrapper));
		++cell;
	}
	return block;
%}
help: %{ This instruction returns a pointer on an array of HTML elements within the DOM having the "class" attribute containing the parameter. %}