import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Runs a tag-extraction regular expression over a sample EPUB table-of-contents
 * document and prints every match together with all of its capture groups.
 */
public class Example {

    /**
     * The expression is an alternation of three cases:
     * <ol>
     *   <li>an opening tag from a fixed list (captured as {@code tag}), a lazily matched
     *       body of at least three characters that may not run into another opening of the
     *       same tag, {@code <pre}, {@code <figure}, {@code <img}, {@code <h\d} or
     *       {@code <li}, then either the closing tag of the same name or a
     *       {@code <figure>}/{@code <img>} tag;</li>
     *   <li>the same kind of opening tag (captured as {@code tag2}, without {@code table})
     *       and body, terminated by a look-ahead for the next {@code <};</li>
     *   <li>a run of at least three non-{@code <} characters that is preceded by {@code >}
     *       and followed by {@code </div>}.</li>
     * </ol>
     *
     * <p>Named groups and their back-references use the Java forms {@code (?<name>...)} and
     * {@code \k<name>}. The Python/PCRE spellings {@code (?P<name>...)} and {@code (?P=name)}
     * are rejected by {@link Pattern#compile(String, int)} with a
     * {@link java.util.regex.PatternSyntaxException}.
     */
    private static final String REGEX = "(((<(?<tag>(title|table|li|p|h\\d|td|li|pre|div|i|a|b|strong))\\b[^>]*?>(((?!<\\k<tag>|<pre|<figure|<img|<h\\d|<li\\b)[\\d\\D])){3,}?)(<(\\/\\k<tag>|figure|img)[^>]*?>))|((<(?<tag2>(title|li|p|h\\d|td|li|pre|div|i|a|b|strong))\\b[^>]*?>(((?!<\\k<tag2>|<pre|<figure|<img|<h\\d|<li\\b)[\\d\\D])){3,}?)(?=<))|((?<=>)[^<]{3,}(?=<\\/div>)))";

    /** Compiled once; the MULTILINE flag is kept from the original sample. */
    private static final Pattern PATTERN = Pattern.compile(REGEX, Pattern.MULTILINE);

    /** Sample XHTML document (an EPUB navigation file) that the pattern is applied to. */
    private static final String INPUT = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
            + "<!DOCTYPE html>\n"
            + "<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\">\n"
            + "<head>\n"
            + "<title>Python Text Mining: Perform Text Processing, Word Embedding, Text Classification and Machine Translation</title>\n"
            + "<meta http-equiv=\"default-style\" content=\"application/xhtml+xml; charset=utf-8\" />\n"
            + "<style>ol{list-style-type:none;}a{text-decoration:none;}</style>\n"
            + "</head>\n"
            + "<body>\n"
            + "<nav id=\"toc\" epub:type=\"toc\">\n"
            + "<h1>Table of Contents</h1>\n"
            + "<div>jkas<div>fbksafbjksa</div>dfdfdfd</div>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"cvi.xhtml#cvi\">Cover Page</a></li>\n"
            + "<li><a href=\"tp.xhtml#s1\">Title \n"
            + "Page</a></li>\n"
            + "<li><a href=\"cop.xhtml\">Copyright Page</a></li>\n"
            + "<li><a href=\"ded.xhtml\">Dedication Page</a></li>\n"
            + "<li><a href=\"ata.xhtml\">About the Author</a></li>\n"
            + "<li><a href=\"fm.xhtml\">About the Reviewer</a></li>\n"
            + "<li><a href=\"ack.xhtml\">Acknowledgement</a></li>\n"
            + "<li><a href=\"pre.xhtml\">Preface</a></li>\n"
            + "<li><a href=\"fm1.xhtml\">Errata</a></li>\n"
            + "<li><a href=\"toc.xhtml\">Table of Contents</a></li>\n"
            + "<li><a href=\"c01.xhtml\">1. Basic Text Processing Techniques</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c01.xhtml#s1\">Introduction</a></li>\n"
            + "<li><a href=\"c01.xhtml#s2\">Structure</a></li>\n"
            + "<li><a href=\"c01.xhtml#s3\">Objectives</a></li>\n"
            + "<li><a href=\"c01.xhtml#s4\">Data preparation</a></li>\n"
            + "<li><a href=\"c01.xhtml#s5\">Project 1: Twitter data analysis</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c01.xhtml#s6\">Scraping the data</a></li>\n"
            + "<li><a href=\"c01.xhtml#s7\">Data pre-processing</a></li>\n"
            + "<li><a href=\"c01.xhtml#s8\">Importing necessary packages</a></li>\n"
            + "<li><a href=\"c01.xhtml#s9\">HTML parsing</a></li>\n"
            + "<li><a href=\"c01.xhtml#s10\">Removing accented characters</a></li>\n"
            + "<li><a href=\"c01.xhtml#s11\">Expanding contractions</a></li>\n"
            + "<li><a href=\"c01.xhtml#s12\">Lemmetization and stemming</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c01.xhtml#s13\">Fail case</a></li></ol></li>\n"
            + "<li><a href=\"c01.xhtml#s14\">Removing special characters</a></li>\n"
            + "<li><a href=\"c01.xhtml#s15\">Removing stop words</a></li>\n"
            + "<li><a href=\"c01.xhtml#s16\">Handling emojis or emoticons</a></li>\n"
            + "<li><a href=\"c01.xhtml#s17\">Emoji removal</a></li>\n"
            + "<li><a href=\"c01.xhtml#s18\">Text acronym abbreviation</a></li>\n"
            + "<li><a href=\"c01.xhtml#s19\">Twitter data processing</a></li>\n"
            + "<li><a href=\"c01.xhtml#s20\">Extracting usertags and hashtags</a></li></ol></li>\n"
            + "<li><a href=\"c01.xhtml#s21\">Project 2: In-shots data pre-processing</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c01.xhtml#s22\">Importing the necessary packages</a></li>\n"
            + "<li><a href=\"c01.xhtml#s23\">Setting the urls for data extraction</a></li>\n"
            + "<li><a href=\"c01.xhtml#s24\">Function to scrape data from the urls</a></li>\n"
            + "<li><a href=\"c01.xhtml#s25\">Importing packages</a></li></ol></li>\n"
            + "<li><a href=\"c01.xhtml#s26\">Conclusion</a></li>\n"
            + "<li><a href=\"c01.xhtml#s27\">Questions</a></li>\n"
            + "<li><a href=\"c01.xhtml#s28\">Multiple choice questions</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c01.xhtml#s29\">Answers</a></li></ol></li></ol></li>\n"
            + "<li><a href=\"c02.xhtml\">2. Text to Numbers</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c02.xhtml#s30\">Introduction</a></li>\n"
            + "<li><a href=\"c02.xhtml#s31\">Structure</a></li>\n"
            + "<li><a href=\"c02.xhtml#s32\">Objectives</a></li>\n"
            + "<li><a href=\"c02.xhtml#s33\">Feature encoding or engineering</a></li>\n"
            + "<li><a href=\"c02.xhtml#s34\">One-hot encoding</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c02.xhtml#s35\">Corpus</a></li>\n"
            + "<li><a href=\"c02.xhtml#s36\">Code</a></li>\n"
            + "<li><a href=\"c02.xhtml#s37\">Creating the text corpus</a></li>\n"
            + "<li><a href=\"c02.xhtml#s38\">Some basic pre-processings</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c02.xhtml#s39\">Min_df</a></li>\n"
            + "<li><a href=\"c02.xhtml#s40\">Max_df</a></li></ol></li>\n"
            + "<li><a href=\"c02.xhtml#s41\">Limitations</a></li></ol></li>\n"
            + "<li><a href=\"c02.xhtml#s42\">Bag of words</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c02.xhtml#s43\">Code</a></li>\n"
            + "<li><a href=\"c02.xhtml#s44\">Performing bag-of-words using sklearn</a></li>\n"
            + "<li><a href=\"c02.xhtml#s45\">Difference between one-hot encoding and bag of words</a></li>\n"
            + "<li><a href=\"c02.xhtml#s46\">Limitations</a></li></ol></li>\n"
            + "<li><a href=\"c02.xhtml#s47\">N-gram model</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c02.xhtml#s48\">Limitations</a></li></ol></li>\n"
            + "<li><a href=\"c02.xhtml#s49\">TF-IDF</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c02.xhtml#s50\">Code</a></li>\n"
            + "<li><a href=\"c02.xhtml#s51\">Performing TF-IDF using sklearn</a></li></ol></li>\n"
            + "<li><a href=\"c02.xhtml#s52\">Project -1</a></li>\n"
            + "<li><a href=\"c02.xhtml#s53\">Solution</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c02.xhtml#s54\">Loading the dataset</a></li>\n"
            + "<li><a href=\"c02.xhtml#s55\">Some basic pre-processings</a></li>\n"
            + "<li><a href=\"c02.xhtml#s56\">One-hot encoding</a></li>\n"
            + "<li><a href=\"c02.xhtml#s57\">Bag of words</a></li>\n"
            + "<li><a href=\"c02.xhtml#s58\">Bag of N-grams model</a></li></ol></li>\n"
            + "<li><a href=\"c02.xhtml#s59\">Project -2</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c02.xhtml#s60\">Loading the dataset</a></li>\n"
            + "<li><a href=\"c02.xhtml#s61\">Some basic pre-processings</a></li>\n"
            + "<li><a href=\"c02.xhtml#s62\">TF-IDF</a></li>\n"
            + "<li><a href=\"c02.xhtml#s63\">Comparison of One-Hot, BOW, and TF-IDF</a></li></ol></li>\n"
            + "<li><a href=\"c02.xhtml#s64\">Conclusion</a></li>\n"
            + "<li><a href=\"c02.xhtml#s65\">Questions</a></li>\n"
            + "<li><a href=\"c02.xhtml#s66\">Multiple choice questions</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c02.xhtml#s67\">Answers</a></li></ol></li></ol></li>\n"
            + "<li><a href=\"c03.xhtml\">3. Word Embeddings</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c03.xhtml#s68\">Introduction</a></li>\n"
            + "<li><a href=\"c03.xhtml#s69\">Structure</a></li>\n"
            + "<li><a href=\"c03.xhtml#s70\">Objective</a></li>\n"
            + "<li><a href=\"c03.xhtml#s71\">Word vectors or word embeddings</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c03.xhtml#s72\">Difference between word embeddings and TF-IDF</a></li></ol></li>\n"
            + "<li><a href=\"c03.xhtml#s73\">Feature engineering with word embeddings</a></li>\n"
            + "<li><a href=\"c03.xhtml#s74\">Word2Vec</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c03.xhtml#s75\">Code</a></li>\n"
            + "<li><a href=\"c03.xhtml#s76\">t-SNE</a></li>\n"
            + "<li><a href=\"c03.xhtml#s77\">Word similarity dataframe</a></li></ol></li>\n"
            + "<li><a href=\"c03.xhtml#s78\">Global Vector (GloVe) Model</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c03.xhtml#s79\">The GloVe Model using Spacy</a></li>\n"
            + "<li><a href=\"c03.xhtml#s80\">Loading the downloaded vector model</a></li>\n"
            + "<li><a href=\"c03.xhtml#s81\">Word vector dataframe</a></li>\n"
            + "<li><a href=\"c03.xhtml#s82\">t-SNE visualization</a></li>\n"
            + "<li><a href=\"c03.xhtml#s83\">Word similarity dataframe</a></li></ol></li>\n"
            + "<li><a href=\"c03.xhtml#s84\">fastText</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c03.xhtml#s85\">fastText using Gensim</a></li>\n"
            + "<li><a href=\"c03.xhtml#s86\">t-SNE visualization</a></li>\n"
            + "<li><a href=\"c03.xhtml#s87\">Finding Odd word out using FastText</a></li></ol></li>\n"
            + "<li><a href=\"c03.xhtml#s88\">Difference between Word2Vec, GloVe, and FastText</a></li>\n"
            + "<li><a href=\"c03.xhtml#s89\">Using pre-trained word embeddings</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c03.xhtml#s90\">Importing necessary libraries</a></li>\n"
            + "<li><a href=\"c03.xhtml#s91\">Loading the Word2Vec model</a></li>\n"
            + "<li><a href=\"c03.xhtml#s92\">Sample data initialization</a></li>\n"
            + "<li><a href=\"c03.xhtml#s93\">Pre-processings and word tokenizations</a></li>\n"
            + "<li><a href=\"c03.xhtml#s94\">Extracting list of unique words</a></li>\n"
            + "<li><a href=\"c03.xhtml#s95\">t-SNE visualization</a></li></ol></li>\n"
            + "<li><a href=\"c03.xhtml#s96\">Project</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c03.xhtml#s97\">Solution</a></li>\n"
            + "<li><a href=\"c03.xhtml#s98\">Importing necessary libraries</a></li>\n"
            + "<li><a href=\"c03.xhtml#s99\">Loading the Word2Vec model</a></li>\n"
            + "<li><a href=\"c03.xhtml#s100\">Scrapping data from inshots</a></li>\n"
            + "<li><a href=\"c03.xhtml#s101\">Pre-processings and word tokenizations</a></li>\n"
            + "<li><a href=\"c03.xhtml#s102\">Extracting list of unique words</a></li>\n"
            + "<li><a href=\"c03.xhtml#s103\">Removing words not in vocab</a></li>\n"
            + "<li><a href=\"c03.xhtml#s104\">t-SNE visualization</a></li></ol></li>\n"
            + "<li><a href=\"c03.xhtml#s105\">Conclusion</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c03.xhtml#s106\">Project</a></li></ol></li></ol></li>\n"
            + "<li><a href=\"c04.xhtml\">4. Topic Modeling</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c04.xhtml#s107\">Introduction</a></li>\n"
            + "<li><a href=\"c04.xhtml#s108\">Structure</a></li>\n"
            + "<li><a href=\"c04.xhtml#s109\">Objectives</a></li>\n"
            + "<li><a href=\"c04.xhtml#s110\">Topic modeling</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c04.xhtml#s111\">Identity a matrix</a></li>\n"
            + "<li><a href=\"c04.xhtml#s112\">Unitary matrix</a></li>\n"
            + "<li><a href=\"c04.xhtml#s113\">Eigen values and Eigen vectors</a></li>\n"
            + "<li><a href=\"c04.xhtml#s114\">Singular value decomposition</a></li>\n"
            + "<li><a href=\"c04.xhtml#s115\">Latent semantic indexing</a></li>\n"
            + "<li><a href=\"c04.xhtml#s116\">TF-IDF vectorization</a></li>\n"
            + "<li><a href=\"c04.xhtml#s117\">Building an SVD model</a></li>\n"
            + "<li><a href=\"c04.xhtml#s118\">Looking at the topics and the words contributing to the topic</a></li>\n"
            + "<li><a href=\"c04.xhtml#s119\">Advantages and disadvantages of LSI</a></li></ol></li>\n"
            + "<li><a href=\"c04.xhtml#s120\">Latent Dirichlet Allocation</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c04.xhtml#s121\">Introduction</a></li>\n"
            + "<li><a href=\"c04.xhtml#s122\">Working</a></li>\n"
            + "<li><a href=\"c04.xhtml#s123\">About the data</a></li>\n"
            + "<li><a href=\"c04.xhtml#s124\">Some pre-processing</a></li>\n"
            + "<li><a href=\"c04.xhtml#s125\">Looking at the top 20 frequently used words</a></li>\n"
            + "<li><a href=\"c04.xhtml#s126\">Some EDA</a></li>\n"
            + "<li><a href=\"c04.xhtml#s127\">Generating Bi-grams (BoW)</a></li>\n"
            + "<li><a href=\"c04.xhtml#s128\">LDA model fitting</a></li>\n"
            + "<li><a href=\"c04.xhtml#s129\">LDA using Gensim and its visualization</a></li>\n"
            + "<li><a href=\"c04.xhtml#s130\">Importing the data</a></li>\n"
            + "<li><a href=\"c04.xhtml#s131\">Some pre-processing</a></li>\n"
            + "<li><a href=\"c04.xhtml#s132\">Extending stop words and building ngram models</a></li>\n"
            + "<li><a href=\"c04.xhtml#s133\">Creating term document frequency and the LDA model</a></li>\n"
            + "<li><a href=\"c04.xhtml#s134\">Dominant topic identification</a></li>\n"
            + "<li><a href=\"c04.xhtml#s135\">PyLDAvis</a></li>\n"
            + "<li><a href=\"c04.xhtml#s136\">Disadvantages of LDA</a></li></ol></li>\n"
            + "<li><a href=\"c04.xhtml#s137\">Non-Negative Matrix Factorization (NMF)</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c04.xhtml#s138\">Importing necessary libraries</a></li>\n"
            + "<li><a href=\"c04.xhtml#s139\">Some pre-processing</a></li>\n"
            + "<li><a href=\"c04.xhtml#s140\">Looking at the top 20 frequently used words</a></li>\n"
            + "<li><a href=\"c04.xhtml#s141\">Some EDA</a></li>\n"
            + "<li><a href=\"c04.xhtml#s142\">Generating Bi-grams (BoW)</a></li>\n"
            + "<li><a href=\"c04.xhtml#s143\">Building TF-IDF vectorizer</a></li>\n"
            + "<li><a href=\"c04.xhtml#s144\">Visualizing ranks with the TF-IDF weights</a></li>\n"
            + "<li><a href=\"c04.xhtml#s145\">NMF modelling</a></li>\n"
            + "<li><a href=\"c04.xhtml#s146\">Disadvantages of NMF</a></li></ol></li>\n"
            + "<li><a href=\"c04.xhtml#s147\">Conclusion</a></li>\n"
            + "<li><a href=\"c04.xhtml#s148\">Questions</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c04.xhtml#s149\">Answers</a></li></ol></li>\n"
            + "<li><a href=\"c04.xhtml#s150\">Projects</a></li></ol></li>\n"
            + "<li><a href=\"c05.xhtml\">5. Unsupervised Sentiment Classification</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c05.xhtml#s151\">Introduction</a></li>\n"
            + "<li><a href=\"c05.xhtml#s152\">Structure</a></li>\n"
            + "<li><a href=\"c05.xhtml#s153\">Objective</a></li>\n"
            + "<li><a href=\"c05.xhtml#s154\">Lexicon-based approach</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c05.xhtml#s155\">About the dataset</a></li>\n"
            + "<li><a href=\"c05.xhtml#s156\">Loading necessary libraries</a></li>\n"
            + "<li><a href=\"c05.xhtml#s157\">Importing the dataset</a></li>\n"
            + "<li><a href=\"c05.xhtml#s158\">Some pre-processings</a></li>\n"
            + "<li><a href=\"c05.xhtml#s159\">Defining a function to perform the following</a></li></ol></li>\n"
            + "<li><a href=\"c05.xhtml#s160\">Opinion lexicon</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c05.xhtml#s161\">Importing the opinion lexicon</a></li>\n"
            + "<li><a href=\"c05.xhtml#s162\">Tokenize the reviews into a sentence and form the sentence and review the ID</a></li>\n"
            + "<li><a href=\"c05.xhtml#s163\">Sentiment classification</a></li>\n"
            + "<li><a href=\"c05.xhtml#s164\">Converting the sentiments to a review level</a></li>\n"
            + "<li><a href=\"c05.xhtml#s165\">Converting the sentiment codes from the dataset to sentiments</a></li></ol></li>\n"
            + "<li><a href=\"c05.xhtml#s166\">Senti WordNet lexicon</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c05.xhtml#s167\">Function to perform SentiWordNet</a></li>\n"
            + "<li><a href=\"c05.xhtml#s168\">Sentiment classification</a></li>\n"
            + "<li><a href=\"c05.xhtml#s169\">Evaluation</a></li></ol></li>\n"
            + "<li><a href=\"c05.xhtml#s170\">TextBlob</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c05.xhtml#s171\">Importing libraries</a></li>\n"
            + "<li><a href=\"c05.xhtml#s172\">Predicting a sentiment of sample reviews</a></li>\n"
            + "<li><a href=\"c05.xhtml#s173\">Prediction and evaluation</a></li></ol></li>\n"
            + "<li><a href=\"c05.xhtml#s174\">AFINN</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c05.xhtml#s175\">Importing necessary libraries</a></li>\n"
            + "<li><a href=\"c05.xhtml#s176\">Sentiment classification and evaluation</a></li></ol></li>\n"
            + "<li><a href=\"c05.xhtml#s177\">VADER</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c05.xhtml#s178\">Importing necessary libraries</a></li>\n"
            + "<li><a href=\"c05.xhtml#s179\">Sentiment classification and evaluation</a></li>\n"
            + "<li><a href=\"c05.xhtml#s180\">Sample prediction</a></li>\n"
            + "<li><a href=\"c05.xhtml#s181\">Drawbacks of lexicon-based sentiment classification</a></li></ol></li>\n"
            + "<li><a href=\"c05.xhtml#s182\">Conclusion</a></li>\n"
            + "<li><a href=\"c05.xhtml#s183\">Questions</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c05.xhtml#s184\">Answers</a></li></ol></li></ol></li>\n"
            + "<li><a href=\"c06.xhtml\">6. Text Classification Using ML</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c06.xhtml#s185\">Introduction</a></li>\n"
            + "<li><a href=\"c06.xhtml#s186\">Structure</a></li>\n"
            + "<li><a href=\"c06.xhtml#s187\">Objectives</a></li>\n"
            + "<li><a href=\"c06.xhtml#s188\">Supervised learning</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c06.xhtml#s189\">About the dataset</a></li>\n"
            + "<li><a href=\"c06.xhtml#s190\">Loading the necessary libraries</a></li>\n"
            + "<li><a href=\"c06.xhtml#s191\">Importing the dataset</a></li>\n"
            + "<li><a href=\"c06.xhtml#s192\">Pre-processings</a></li>\n"
            + "<li><a href=\"c06.xhtml#s193\">Performing TF-IDF</a></li></ol></li>\n"
            + "<li><a href=\"c06.xhtml#s194\">Model fitting</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c06.xhtml#s195\">Logistic regression</a></li>\n"
            + "<li><a href=\"c06.xhtml#s196\">Lasso regularization</a></li>\n"
            + "<li><a href=\"c06.xhtml#s197\">Ridge regularization</a></li>\n"
            + "<li><a href=\"c06.xhtml#s198\">Elastic-net classifier</a></li>\n"
            + "<li><a href=\"c06.xhtml#s199\">Naïve Bayes algorithm</a></li>\n"
            + "<li><a href=\"c06.xhtml#s200\">K – Nearest Neighbors</a></li>\n"
            + "<li><a href=\"c06.xhtml#s201\">Decision tree</a></li>\n"
            + "<li><a href=\"c06.xhtml#s202\">Random forest</a></li>\n"
            + "<li><a href=\"c06.xhtml#s203\">Ada Boost</a></li>\n"
            + "<li><a href=\"c06.xhtml#s204\">Gradient boosting machine</a></li>\n"
            + "<li><a href=\"c06.xhtml#s205\">XG-Boost</a></li></ol></li>\n"
            + "<li><a href=\"c06.xhtml#s206\">Grid Search</a></li>\n"
            + "<li><a href=\"c06.xhtml#s207\">Conclusion</a></li>\n"
            + "<li><a href=\"c06.xhtml#s208\">Questions</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c06.xhtml#s209\">Answers</a></li></ol></li>\n"
            + "<li><a href=\"c06.xhtml#s210\">Project</a></li></ol></li>\n"
            + "<li><a href=\"c07.xhtml\">7. Text Classification Using Deep Learning</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c07.xhtml#s211\">Introduction</a></li>\n"
            + "<li><a href=\"c07.xhtml#s212\">Structure</a></li>\n"
            + "<li><a href=\"c07.xhtml#s213\">Objectives</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c07.xhtml#s214\">Learning about the Neural Networks</a></li></ol></li>\n"
            + "<li><a href=\"c07.xhtml#s215\">Neural networks for sentiment classification</a></li>\n"
            + "<li><a href=\"c07.xhtml#s216\">Neural networks with TF-IDF</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c07.xhtml#s217\">Installing libraries</a></li>\n"
            + "<li><a href=\"c07.xhtml#s218\">Importing libraries</a></li>\n"
            + "<li><a href=\"c07.xhtml#s219\">Importing the dataset</a></li>\n"
            + "<li><a href=\"c07.xhtml#s220\">Pre-processings</a></li>\n"
            + "<li><a href=\"c07.xhtml#s221\">Train, test, and validation set</a></li>\n"
            + "<li><a href=\"c07.xhtml#s222\">Performing TF-IDF</a></li>\n"
            + "<li><a href=\"c07.xhtml#s223\">Model building</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c07.xhtml#s224\">Linear regression</a></li>\n"
            + "<li><a href=\"c07.xhtml#s225\">Increasing the dimensionality</a></li></ol></li>\n"
            + "<li><a href=\"c07.xhtml#s226\">Activation functions</a></li>\n"
            + "<li><a href=\"c07.xhtml#s227\">Model fitting</a></li>\n"
            + "<li><a href=\"c07.xhtml#s228\">Cross – validation</a></li></ol></li>\n"
            + "<li><a href=\"c07.xhtml#s229\">Neural networks with word2vec:</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c07.xhtml#s230\">Data splitting</a></li>\n"
            + "<li><a href=\"c07.xhtml#s231\">Creating a Word2Vec model</a></li>\n"
            + "<li><a href=\"c07.xhtml#s232\">Word2Vec model fitting</a></li>\n"
            + "<li><a href=\"c07.xhtml#s233\">Creating word vectors</a></li>\n"
            + "<li><a href=\"c07.xhtml#s234\">Padding sequences</a></li>\n"
            + "<li><a href=\"c07.xhtml#s235\">ANN model building</a></li>\n"
            + "<li><a href=\"c07.xhtml#s236\">Model fitting</a></li>\n"
            + "<li><a href=\"c07.xhtml#s237\">Cross-validation</a></li>\n"
            + "<li><a href=\"c07.xhtml#s238\">Sentiment analysis using LSTM</a></li>\n"
            + "<li><a href=\"c07.xhtml#s239\">Importing the dataset</a></li>\n"
            + "<li><a href=\"c07.xhtml#s240\">Pre-processings</a></li>\n"
            + "<li><a href=\"c07.xhtml#s241\">Data splitting and padding</a></li>\n"
            + "<li><a href=\"c07.xhtml#s242\">LSTM model building</a></li>\n"
            + "<li><a href=\"c07.xhtml#s243\">Cross-validation</a></li>\n"
            + "<li><a href=\"c07.xhtml#s244\">Comparison of results</a></li></ol></li>\n"
            + "<li><a href=\"c07.xhtml#s245\">Conclusion</a></li>\n"
            + "<li><a href=\"c07.xhtml#s246\">Questions</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c07.xhtml#s247\">Answers</a></li></ol></li></ol></li>\n"
            + "<li><a href=\"c08.xhtml\">8. Recommendation Engine</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c08.xhtml#s248\">Introduction</a></li>\n"
            + "<li><a href=\"c08.xhtml#s249\">Structure</a></li>\n"
            + "<li><a href=\"c08.xhtml#s250\">Objective</a></li>\n"
            + "<li><a href=\"c08.xhtml#s251\">Applications</a></li>\n"
            + "<li><a href=\"c08.xhtml#s252\">Classification of a recommendation system</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c08.xhtml#s253\">Simple rule-based recommenders</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c08.xhtml#s254\">About the dataset</a></li>\n"
            + "<li><a href=\"c08.xhtml#s255\">Installing and loading necessary libraries</a></li>\n"
            + "<li><a href=\"c08.xhtml#s256\">Importing the dataset</a></li>\n"
            + "<li><a href=\"c08.xhtml#s257\">Building a simple rule-based recommendation system</a></li>\n"
            + "<li><a href=\"c08.xhtml#s258\">Weighted ratings calculation</a></li>\n"
            + "<li><a href=\"c08.xhtml#s259\">Applying the calculation on the filtered records</a></li></ol></li>\n"
            + "<li><a href=\"c08.xhtml#s260\">Content based</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c08.xhtml#s261\">Using document similarity</a></li>\n"
            + "<li><a href=\"c08.xhtml#s262\">About the dataset</a></li>\n"
            + "<li><a href=\"c08.xhtml#s263\">Installing and loading necessary libraries</a></li>\n"
            + "<li><a href=\"c08.xhtml#s264\">Importing the dataset</a></li>\n"
            + "<li><a href=\"c08.xhtml#s265\">Some pre-processing</a></li></ol></li></ol></li>\n"
            + "<li><a href=\"c08.xhtml#s266\">Extract TF-IDF features</a></li>\n"
            + "<li><a href=\"c08.xhtml#s267\">Computing pairwise document similarity</a></li>\n"
            + "<li><a href=\"c08.xhtml#s268\">Building a movie recommender</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c08.xhtml#s269\">Using word embedding</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c08.xhtml#s270\">FastText</a></li></ol></li>\n"
            + "<li><a href=\"c08.xhtml#s271\">Generate document-level embeddings</a></li>\n"
            + "<li><a href=\"c08.xhtml#s272\">Collaborative-based</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c08.xhtml#s273\">User-based</a></li>\n"
            + "<li><a href=\"c08.xhtml#s274\">About the dataset</a></li>\n"
            + "<li><a href=\"c08.xhtml#s275\">Installing and loading necessary libraries</a></li>\n"
            + "<li><a href=\"c08.xhtml#s276\">Importing the dataset</a></li></ol></li></ol></li>\n"
            + "<li><a href=\"c08.xhtml#s277\">Advantages of a recommendation system</a></li>\n"
            + "<li><a href=\"c08.xhtml#s278\">Conclusion</a></li>\n"
            + "<li><a href=\"c08.xhtml#s279\">Questions</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c08.xhtml#s280\">Answers</a></li></ol></li></ol></li>\n"
            + "<li><a href=\"c09.xhtml\">9. Machine Translation</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c09.xhtml#s281\">Introduction</a></li>\n"
            + "<li><a href=\"c09.xhtml#s282\">Structure</a></li>\n"
            + "<li><a href=\"c09.xhtml#s283\">Objectives</a></li>\n"
            + "<li><a href=\"c09.xhtml#s284\">Application</a></li>\n"
            + "<li><a href=\"c09.xhtml#s285\">Types of MT</a></li>\n"
            + "<li><a href=\"c09.xhtml#s286\">Readily available libraries</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c09.xhtml#s287\">TextBlob</a></li>\n"
            + "<li><a href=\"c09.xhtml#s288\">LangDetect</a></li>\n"
            + "<li><a href=\"c09.xhtml#s289\">Fasttext</a></li></ol></li>\n"
            + "<li><a href=\"c09.xhtml#s290\">Sequence-to-sequence modeling</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c09.xhtml#s291\">About the dataset</a></li>\n"
            + "<li><a href=\"c09.xhtml#s292\">Installing and loading necessary libraries:</a></li>\n"
            + "<li><a href=\"c09.xhtml#s293\">Importing the dataset</a></li>\n"
            + "<li><a href=\"c09.xhtml#s294\">Preprocessing</a></li></ol></li>\n"
            + "<li><a href=\"c09.xhtml#s295\">Model building (using LSTM)</a></li>\n"
            + "<li><a href=\"c09.xhtml#s296\">Conclusion</a></li>\n"
            + "<li><a href=\"c09.xhtml#s297\">Exercise</a></li>\n"
            + "<li><a href=\"c09.xhtml#s298\">Questions</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c09.xhtml#s299\">Answers</a></li></ol></li></ol></li>\n"
            + "<li><a href=\"c10.xhtml\">10. Transfer Learning</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c10.xhtml#s300\">Introduction</a></li>\n"
            + "<li><a href=\"c10.xhtml#s301\">Structure</a></li>\n"
            + "<li><a href=\"c10.xhtml#s302\">Objectives</a></li>\n"
            + "<li><a href=\"c10.xhtml#s303\">Universal Sentence Encoder</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c10.xhtml#s304\">Goal</a></li></ol></li>\n"
            + "<li><a href=\"c10.xhtml#s305\">What is a transformer and do we need it?</a></li>\n"
            + "<li><a href=\"c10.xhtml#s306\">Deep Averaging Network (DAN)</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c10.xhtml#s307\">About the data</a></li>\n"
            + "<li><a href=\"c10.xhtml#s308\">Data pre-processing</a></li></ol></li>\n"
            + "<li><a href=\"c10.xhtml#s309\">Bidirectional Encoder Representation from Transformer (BERT)</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c10.xhtml#s310\">What is the necessity of BERT?</a></li>\n"
            + "<li><a href=\"c10.xhtml#s311\">The main idea behind BERT</a></li>\n"
            + "<li><a href=\"c10.xhtml#s312\">Why is BERT so powerful?</a></li>\n"
            + "<li><a href=\"c10.xhtml#s313\">BERT architecture</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c10.xhtml#s314\">Text processing</a></li>\n"
            + "<li><a href=\"c10.xhtml#s315\">Pre-training tasks</a></li></ol></li></ol></li>\n"
            + "<li><a href=\"c10.xhtml#s316\">Fine tuning</a></li>\n"
            + "<li><a href=\"c10.xhtml#s317\">Drawbacks</a></li>\n"
            + "<li><a href=\"c10.xhtml#s318\">Conclusion</a></li>\n"
            + "<li><a href=\"c10.xhtml#s319\">Multiple choice questions</a>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a href=\"c10.xhtml#s320\">Answers</a></li></ol></li>\n"
            + "<li><a href=\"c10.xhtml#s321\">Project</a></li></ol></li>\n"
            + "<li><a href=\"ind.xhtml\">Index</a></li>\n"
            + "</ol>\n"
            + "</nav>\n"
            + "<nav epub:type=\"landmarks\">\n"
            + "<h3>Guide</h3>\n"
            + "<ol epub:type=\"list\">\n"
            + "<li><a epub:type=\"titlepage\" href=\"tp.xhtml\">Title Page</a></li>\n"
            + "<li><a epub:type=\"copyright-page\" href=\"cop.xhtml\">Copyright Page</a></li>\n"
            + "<li><a epub:type=\"toc\" href=\"toc.xhtml\">Table of Contents</a></li>\n"
            + "<li><a epub:type=\"bodymatter\" href=\"c01.xhtml\">1. Basic Text Processing Techniques</a></li>\n"
            + "</ol>\n"
            + "</nav>\n"
            + "</body>\n"
            + "</html>";

    /**
     * Scans the sample document and prints each match to standard output: first a
     * {@code Full match:} line, then one {@code Group N:} line per capture group.
     * Groups that did not take part in a match are printed as {@code null}.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        final Matcher matcher = PATTERN.matcher(INPUT);
        while (matcher.find()) {
            System.out.println("Full match: " + matcher.group(0));
            for (int i = 1; i <= matcher.groupCount(); i++) {
                System.out.println("Group " + i + ": " + matcher.group(i));
            }
        }
    }
}
// Please keep in mind that these code samples are automatically generated and are not guaranteed to work. If you find any syntax errors, feel free to submit a bug report. For a full regex reference for Java, please visit: https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html