# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
# Pattern for locating text-bearing HTML elements.  One outer capturing group
# wraps three alternatives that are tried in order.  The literal is split
# across lines only for readability; adjacent raw strings concatenate into a
# single pattern string.
regex = (
    r"("
    # Alternative 1 -- a complete element.  Opening tag whose name (named
    # group "tag") is one of the listed names ("li" is listed twice; the
    # second occurrence is redundant), followed by optional attributes.
    r"((<(?P<tag>(title|table|li|p|h\d|td|li|pre|div|i|a|b|strong))\b[^>]*?>"
    # Content: at least three characters, matched lazily.  No character of
    # the content may be the start of "<" + the same tag name, "<pre",
    # "<figure", "<img", "<h" + digit, or "<li".
    r"(((?!<(?P=tag)|<pre|<figure|<img|<h\d|<li\b)[\d\D])){3,}?)"
    # Terminator (consumed): the closing tag for "tag", or a <figure>/<img>
    # tag.
    r"(<(\/(?P=tag)|figure|img)[^>]*?>))"
    # Alternative 2 -- an opening tag (named group "tag2"; same name list but
    # without "table") and its content under the same restrictions, ending
    # just before the next "<" (lookahead, not consumed).
    r"|((<(?P<tag2>(title|li|p|h\d|td|li|pre|div|i|a|b|strong))\b[^>]*?>"
    r"(((?!<(?P=tag2)|<pre|<figure|<img|<h\d|<li\b)[\d\D])){3,}?)"
    r"(?=<))"
    # Alternative 3 -- bare text: three or more non-"<" characters that are
    # preceded by ">" and immediately followed by "</div>".
    r"|((?<=>)[^<]{3,}(?=<\/div>))"
    r")"
)
test_str = ("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
"<!DOCTYPE html>\n"
"<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\">\n"
"<head>\n"
"<title>Python Text Mining: Perform Text Processing, Word Embedding, Text Classification and Machine Translation</title>\n"
"<meta http-equiv=\"default-style\" content=\"application/xhtml+xml; charset=utf-8\" />\n"
"<style>ol{list-style-type:none;}a{text-decoration:none;}</style>\n"
"</head>\n"
"<body>\n"
"<nav id=\"toc\" epub:type=\"toc\">\n"
"<h1>Table of Contents</h1>\n"
"<div>jkas<div>fbksafbjksa</div>dfdfdfd</div>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"cvi.xhtml#cvi\">Cover Page</a></li>\n"
"<li><a href=\"tp.xhtml#s1\">Title \n"
"Page</a></li>\n"
"<li><a href=\"cop.xhtml\">Copyright Page</a></li>\n"
"<li><a href=\"ded.xhtml\">Dedication Page</a></li>\n"
"<li><a href=\"ata.xhtml\">About the Author</a></li>\n"
"<li><a href=\"fm.xhtml\">About the Reviewer</a></li>\n"
"<li><a href=\"ack.xhtml\">Acknowledgement</a></li>\n"
"<li><a href=\"pre.xhtml\">Preface</a></li>\n"
"<li><a href=\"fm1.xhtml\">Errata</a></li>\n"
"<li><a href=\"toc.xhtml\">Table of Contents</a></li>\n"
"<li><a href=\"c01.xhtml\">1. Basic Text Processing Techniques</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c01.xhtml#s1\">Introduction</a></li>\n"
"<li><a href=\"c01.xhtml#s2\">Structure</a></li>\n"
"<li><a href=\"c01.xhtml#s3\">Objectives</a></li>\n"
"<li><a href=\"c01.xhtml#s4\">Data preparation</a></li>\n"
"<li><a href=\"c01.xhtml#s5\">Project 1: Twitter data analysis</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c01.xhtml#s6\">Scraping the data</a></li>\n"
"<li><a href=\"c01.xhtml#s7\">Data pre-processing</a></li>\n"
"<li><a href=\"c01.xhtml#s8\">Importing necessary packages</a></li>\n"
"<li><a href=\"c01.xhtml#s9\">HTML parsing</a></li>\n"
"<li><a href=\"c01.xhtml#s10\">Removing accented characters</a></li>\n"
"<li><a href=\"c01.xhtml#s11\">Expanding contractions</a></li>\n"
"<li><a href=\"c01.xhtml#s12\">Lemmetization and stemming</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c01.xhtml#s13\">Fail case</a></li></ol></li>\n"
"<li><a href=\"c01.xhtml#s14\">Removing special characters</a></li>\n"
"<li><a href=\"c01.xhtml#s15\">Removing stop words</a></li>\n"
"<li><a href=\"c01.xhtml#s16\">Handling emojis or emoticons</a></li>\n"
"<li><a href=\"c01.xhtml#s17\">Emoji removal</a></li>\n"
"<li><a href=\"c01.xhtml#s18\">Text acronym abbreviation</a></li>\n"
"<li><a href=\"c01.xhtml#s19\">Twitter data processing</a></li>\n"
"<li><a href=\"c01.xhtml#s20\">Extracting usertags and hashtags</a></li></ol></li>\n"
"<li><a href=\"c01.xhtml#s21\">Project 2: In-shots data pre-processing</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c01.xhtml#s22\">Importing the necessary packages</a></li>\n"
"<li><a href=\"c01.xhtml#s23\">Setting the urls for data extraction</a></li>\n"
"<li><a href=\"c01.xhtml#s24\">Function to scrape data from the urls</a></li>\n"
"<li><a href=\"c01.xhtml#s25\">Importing packages</a></li></ol></li>\n"
"<li><a href=\"c01.xhtml#s26\">Conclusion</a></li>\n"
"<li><a href=\"c01.xhtml#s27\">Questions</a></li>\n"
"<li><a href=\"c01.xhtml#s28\">Multiple choice questions</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c01.xhtml#s29\">Answers</a></li></ol></li></ol></li>\n"
"<li><a href=\"c02.xhtml\">2. Text to Numbers</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c02.xhtml#s30\">Introduction</a></li>\n"
"<li><a href=\"c02.xhtml#s31\">Structure</a></li>\n"
"<li><a href=\"c02.xhtml#s32\">Objectives</a></li>\n"
"<li><a href=\"c02.xhtml#s33\">Feature encoding or engineering</a></li>\n"
"<li><a href=\"c02.xhtml#s34\">One-hot encoding</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c02.xhtml#s35\">Corpus</a></li>\n"
"<li><a href=\"c02.xhtml#s36\">Code</a></li>\n"
"<li><a href=\"c02.xhtml#s37\">Creating the text corpus</a></li>\n"
"<li><a href=\"c02.xhtml#s38\">Some basic pre-processings</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c02.xhtml#s39\">Min_df</a></li>\n"
"<li><a href=\"c02.xhtml#s40\">Max_df</a></li></ol></li>\n"
"<li><a href=\"c02.xhtml#s41\">Limitations</a></li></ol></li>\n"
"<li><a href=\"c02.xhtml#s42\">Bag of words</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c02.xhtml#s43\">Code</a></li>\n"
"<li><a href=\"c02.xhtml#s44\">Performing bag-of-words using sklearn</a></li>\n"
"<li><a href=\"c02.xhtml#s45\">Difference between one-hot encoding and bag of words</a></li>\n"
"<li><a href=\"c02.xhtml#s46\">Limitations</a></li></ol></li>\n"
"<li><a href=\"c02.xhtml#s47\">N-gram model</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c02.xhtml#s48\">Limitations</a></li></ol></li>\n"
"<li><a href=\"c02.xhtml#s49\">TF-IDF</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c02.xhtml#s50\">Code</a></li>\n"
"<li><a href=\"c02.xhtml#s51\">Performing TF-IDF using sklearn</a></li></ol></li>\n"
"<li><a href=\"c02.xhtml#s52\">Project -1</a></li>\n"
"<li><a href=\"c02.xhtml#s53\">Solution</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c02.xhtml#s54\">Loading the dataset</a></li>\n"
"<li><a href=\"c02.xhtml#s55\">Some basic pre-processings</a></li>\n"
"<li><a href=\"c02.xhtml#s56\">One-hot encoding</a></li>\n"
"<li><a href=\"c02.xhtml#s57\">Bag of words</a></li>\n"
"<li><a href=\"c02.xhtml#s58\">Bag of N-grams model</a></li></ol></li>\n"
"<li><a href=\"c02.xhtml#s59\">Project -2</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c02.xhtml#s60\">Loading the dataset</a></li>\n"
"<li><a href=\"c02.xhtml#s61\">Some basic pre-processings</a></li>\n"
"<li><a href=\"c02.xhtml#s62\">TF-IDF</a></li>\n"
"<li><a href=\"c02.xhtml#s63\">Comparison of One-Hot, BOW, and TF-IDF</a></li></ol></li>\n"
"<li><a href=\"c02.xhtml#s64\">Conclusion</a></li>\n"
"<li><a href=\"c02.xhtml#s65\">Questions</a></li>\n"
"<li><a href=\"c02.xhtml#s66\">Multiple choice questions</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c02.xhtml#s67\">Answers</a></li></ol></li></ol></li>\n"
"<li><a href=\"c03.xhtml\">3. Word Embeddings</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c03.xhtml#s68\">Introduction</a></li>\n"
"<li><a href=\"c03.xhtml#s69\">Structure</a></li>\n"
"<li><a href=\"c03.xhtml#s70\">Objective</a></li>\n"
"<li><a href=\"c03.xhtml#s71\">Word vectors or word embeddings</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c03.xhtml#s72\">Difference between word embeddings and TF-IDF</a></li></ol></li>\n"
"<li><a href=\"c03.xhtml#s73\">Feature engineering with word embeddings</a></li>\n"
"<li><a href=\"c03.xhtml#s74\">Word2Vec</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c03.xhtml#s75\">Code</a></li>\n"
"<li><a href=\"c03.xhtml#s76\">t-SNE</a></li>\n"
"<li><a href=\"c03.xhtml#s77\">Word similarity dataframe</a></li></ol></li>\n"
"<li><a href=\"c03.xhtml#s78\">Global Vector (GloVe) Model</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c03.xhtml#s79\">The GloVe Model using Spacy</a></li>\n"
"<li><a href=\"c03.xhtml#s80\">Loading the downloaded vector model</a></li>\n"
"<li><a href=\"c03.xhtml#s81\">Word vector dataframe</a></li>\n"
"<li><a href=\"c03.xhtml#s82\">t-SNE visualization</a></li>\n"
"<li><a href=\"c03.xhtml#s83\">Word similarity dataframe</a></li></ol></li>\n"
"<li><a href=\"c03.xhtml#s84\">fastText</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c03.xhtml#s85\">fastText using Gensim</a></li>\n"
"<li><a href=\"c03.xhtml#s86\">t-SNE visualization</a></li>\n"
"<li><a href=\"c03.xhtml#s87\">Finding Odd word out using FastText</a></li></ol></li>\n"
"<li><a href=\"c03.xhtml#s88\">Difference between Word2Vec, GloVe, and FastText</a></li>\n"
"<li><a href=\"c03.xhtml#s89\">Using pre-trained word embeddings</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c03.xhtml#s90\">Importing necessary libraries</a></li>\n"
"<li><a href=\"c03.xhtml#s91\">Loading the Word2Vec model</a></li>\n"
"<li><a href=\"c03.xhtml#s92\">Sample data initialization</a></li>\n"
"<li><a href=\"c03.xhtml#s93\">Pre-processings and word tokenizations</a></li>\n"
"<li><a href=\"c03.xhtml#s94\">Extracting list of unique words</a></li>\n"
"<li><a href=\"c03.xhtml#s95\">t-SNE visualization</a></li></ol></li>\n"
"<li><a href=\"c03.xhtml#s96\">Project</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c03.xhtml#s97\">Solution</a></li>\n"
"<li><a href=\"c03.xhtml#s98\">Importing necessary libraries</a></li>\n"
"<li><a href=\"c03.xhtml#s99\">Loading the Word2Vec model</a></li>\n"
"<li><a href=\"c03.xhtml#s100\">Scrapping data from inshots</a></li>\n"
"<li><a href=\"c03.xhtml#s101\">Pre-processings and word tokenizations</a></li>\n"
"<li><a href=\"c03.xhtml#s102\">Extracting list of unique words</a></li>\n"
"<li><a href=\"c03.xhtml#s103\">Removing words not in vocab</a></li>\n"
"<li><a href=\"c03.xhtml#s104\">t-SNE visualization</a></li></ol></li>\n"
"<li><a href=\"c03.xhtml#s105\">Conclusion</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c03.xhtml#s106\">Project</a></li></ol></li></ol></li>\n"
"<li><a href=\"c04.xhtml\">4. Topic Modeling</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c04.xhtml#s107\">Introduction</a></li>\n"
"<li><a href=\"c04.xhtml#s108\">Structure</a></li>\n"
"<li><a href=\"c04.xhtml#s109\">Objectives</a></li>\n"
"<li><a href=\"c04.xhtml#s110\">Topic modeling</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c04.xhtml#s111\">Identity a matrix</a></li>\n"
"<li><a href=\"c04.xhtml#s112\">Unitary matrix</a></li>\n"
"<li><a href=\"c04.xhtml#s113\">Eigen values and Eigen vectors</a></li>\n"
"<li><a href=\"c04.xhtml#s114\">Singular value decomposition</a></li>\n"
"<li><a href=\"c04.xhtml#s115\">Latent semantic indexing</a></li>\n"
"<li><a href=\"c04.xhtml#s116\">TF-IDF vectorization</a></li>\n"
"<li><a href=\"c04.xhtml#s117\">Building an SVD model</a></li>\n"
"<li><a href=\"c04.xhtml#s118\">Looking at the topics and the words contributing to the topic</a></li>\n"
"<li><a href=\"c04.xhtml#s119\">Advantages and disadvantages of LSI</a></li></ol></li>\n"
"<li><a href=\"c04.xhtml#s120\">Latent Dirichlet Allocation</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c04.xhtml#s121\">Introduction</a></li>\n"
"<li><a href=\"c04.xhtml#s122\">Working</a></li>\n"
"<li><a href=\"c04.xhtml#s123\">About the data</a></li>\n"
"<li><a href=\"c04.xhtml#s124\">Some pre-processing</a></li>\n"
"<li><a href=\"c04.xhtml#s125\">Looking at the top 20 frequently used words</a></li>\n"
"<li><a href=\"c04.xhtml#s126\">Some EDA</a></li>\n"
"<li><a href=\"c04.xhtml#s127\">Generating Bi-grams (BoW)</a></li>\n"
"<li><a href=\"c04.xhtml#s128\">LDA model fitting</a></li>\n"
"<li><a href=\"c04.xhtml#s129\">LDA using Gensim and its visualization</a></li>\n"
"<li><a href=\"c04.xhtml#s130\">Importing the data</a></li>\n"
"<li><a href=\"c04.xhtml#s131\">Some pre-processing</a></li>\n"
"<li><a href=\"c04.xhtml#s132\">Extending stop words and building ngram models</a></li>\n"
"<li><a href=\"c04.xhtml#s133\">Creating term document frequency and the LDA model</a></li>\n"
"<li><a href=\"c04.xhtml#s134\">Dominant topic identification</a></li>\n"
"<li><a href=\"c04.xhtml#s135\">PyLDAvis</a></li>\n"
"<li><a href=\"c04.xhtml#s136\">Disadvantages of LDA</a></li></ol></li>\n"
"<li><a href=\"c04.xhtml#s137\">Non-Negative Matrix Factorization (NMF)</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c04.xhtml#s138\">Importing necessary libraries</a></li>\n"
"<li><a href=\"c04.xhtml#s139\">Some pre-processing</a></li>\n"
"<li><a href=\"c04.xhtml#s140\">Looking at the top 20 frequently used words</a></li>\n"
"<li><a href=\"c04.xhtml#s141\">Some EDA</a></li>\n"
"<li><a href=\"c04.xhtml#s142\">Generating Bi-grams (BoW)</a></li>\n"
"<li><a href=\"c04.xhtml#s143\">Building TF-IDF vectorizer</a></li>\n"
"<li><a href=\"c04.xhtml#s144\">Visualizing ranks with the TF-IDF weights</a></li>\n"
"<li><a href=\"c04.xhtml#s145\">NMF modelling</a></li>\n"
"<li><a href=\"c04.xhtml#s146\">Disadvantages of NMF</a></li></ol></li>\n"
"<li><a href=\"c04.xhtml#s147\">Conclusion</a></li>\n"
"<li><a href=\"c04.xhtml#s148\">Questions</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c04.xhtml#s149\">Answers</a></li></ol></li>\n"
"<li><a href=\"c04.xhtml#s150\">Projects</a></li></ol></li>\n"
"<li><a href=\"c05.xhtml\">5. Unsupervised Sentiment Classification</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c05.xhtml#s151\">Introduction</a></li>\n"
"<li><a href=\"c05.xhtml#s152\">Structure</a></li>\n"
"<li><a href=\"c05.xhtml#s153\">Objective</a></li>\n"
"<li><a href=\"c05.xhtml#s154\">Lexicon-based approach</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c05.xhtml#s155\">About the dataset</a></li>\n"
"<li><a href=\"c05.xhtml#s156\">Loading necessary libraries</a></li>\n"
"<li><a href=\"c05.xhtml#s157\">Importing the dataset</a></li>\n"
"<li><a href=\"c05.xhtml#s158\">Some pre-processings</a></li>\n"
"<li><a href=\"c05.xhtml#s159\">Defining a function to perform the following</a></li></ol></li>\n"
"<li><a href=\"c05.xhtml#s160\">Opinion lexicon</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c05.xhtml#s161\">Importing the opinion lexicon</a></li>\n"
"<li><a href=\"c05.xhtml#s162\">Tokenize the reviews into a sentence and form the sentence and review the ID</a></li>\n"
"<li><a href=\"c05.xhtml#s163\">Sentiment classification</a></li>\n"
"<li><a href=\"c05.xhtml#s164\">Converting the sentiments to a review level</a></li>\n"
"<li><a href=\"c05.xhtml#s165\">Converting the sentiment codes from the dataset to sentiments</a></li></ol></li>\n"
"<li><a href=\"c05.xhtml#s166\">Senti WordNet lexicon</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c05.xhtml#s167\">Function to perform SentiWordNet</a></li>\n"
"<li><a href=\"c05.xhtml#s168\">Sentiment classification</a></li>\n"
"<li><a href=\"c05.xhtml#s169\">Evaluation</a></li></ol></li>\n"
"<li><a href=\"c05.xhtml#s170\">TextBlob</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c05.xhtml#s171\">Importing libraries</a></li>\n"
"<li><a href=\"c05.xhtml#s172\">Predicting a sentiment of sample reviews</a></li>\n"
"<li><a href=\"c05.xhtml#s173\">Prediction and evaluation</a></li></ol></li>\n"
"<li><a href=\"c05.xhtml#s174\">AFINN</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c05.xhtml#s175\">Importing necessary libraries</a></li>\n"
"<li><a href=\"c05.xhtml#s176\">Sentiment classification and evaluation</a></li></ol></li>\n"
"<li><a href=\"c05.xhtml#s177\">VADER</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c05.xhtml#s178\">Importing necessary libraries</a></li>\n"
"<li><a href=\"c05.xhtml#s179\">Sentiment classification and evaluation</a></li>\n"
"<li><a href=\"c05.xhtml#s180\">Sample prediction</a></li>\n"
"<li><a href=\"c05.xhtml#s181\">Drawbacks of lexicon-based sentiment classification</a></li></ol></li>\n"
"<li><a href=\"c05.xhtml#s182\">Conclusion</a></li>\n"
"<li><a href=\"c05.xhtml#s183\">Questions</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c05.xhtml#s184\">Answers</a></li></ol></li></ol></li>\n"
"<li><a href=\"c06.xhtml\">6. Text Classification Using ML</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c06.xhtml#s185\">Introduction</a></li>\n"
"<li><a href=\"c06.xhtml#s186\">Structure</a></li>\n"
"<li><a href=\"c06.xhtml#s187\">Objectives</a></li>\n"
"<li><a href=\"c06.xhtml#s188\">Supervised learning</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c06.xhtml#s189\">About the dataset</a></li>\n"
"<li><a href=\"c06.xhtml#s190\">Loading the necessary libraries</a></li>\n"
"<li><a href=\"c06.xhtml#s191\">Importing the dataset</a></li>\n"
"<li><a href=\"c06.xhtml#s192\">Pre-processings</a></li>\n"
"<li><a href=\"c06.xhtml#s193\">Performing TF-IDF</a></li></ol></li>\n"
"<li><a href=\"c06.xhtml#s194\">Model fitting</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c06.xhtml#s195\">Logistic regression</a></li>\n"
"<li><a href=\"c06.xhtml#s196\">Lasso regularization</a></li>\n"
"<li><a href=\"c06.xhtml#s197\">Ridge regularization</a></li>\n"
"<li><a href=\"c06.xhtml#s198\">Elastic-net classifier</a></li>\n"
"<li><a href=\"c06.xhtml#s199\">Naïve Bayes algorithm</a></li>\n"
"<li><a href=\"c06.xhtml#s200\">K – Nearest Neighbors</a></li>\n"
"<li><a href=\"c06.xhtml#s201\">Decision tree</a></li>\n"
"<li><a href=\"c06.xhtml#s202\">Random forest</a></li>\n"
"<li><a href=\"c06.xhtml#s203\">Ada Boost</a></li>\n"
"<li><a href=\"c06.xhtml#s204\">Gradient boosting machine</a></li>\n"
"<li><a href=\"c06.xhtml#s205\">XG-Boost</a></li></ol></li>\n"
"<li><a href=\"c06.xhtml#s206\">Grid Search</a></li>\n"
"<li><a href=\"c06.xhtml#s207\">Conclusion</a></li>\n"
"<li><a href=\"c06.xhtml#s208\">Questions</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c06.xhtml#s209\">Answers</a></li></ol></li>\n"
"<li><a href=\"c06.xhtml#s210\">Project</a></li></ol></li>\n"
"<li><a href=\"c07.xhtml\">7. Text Classification Using Deep Learning</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c07.xhtml#s211\">Introduction</a></li>\n"
"<li><a href=\"c07.xhtml#s212\">Structure</a></li>\n"
"<li><a href=\"c07.xhtml#s213\">Objectives</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c07.xhtml#s214\">Learning about the Neural Networks</a></li></ol></li>\n"
"<li><a href=\"c07.xhtml#s215\">Neural networks for sentiment classification</a></li>\n"
"<li><a href=\"c07.xhtml#s216\">Neural networks with TF-IDF</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c07.xhtml#s217\">Installing libraries</a></li>\n"
"<li><a href=\"c07.xhtml#s218\">Importing libraries</a></li>\n"
"<li><a href=\"c07.xhtml#s219\">Importing the dataset</a></li>\n"
"<li><a href=\"c07.xhtml#s220\">Pre-processings</a></li>\n"
"<li><a href=\"c07.xhtml#s221\">Train, test, and validation set</a></li>\n"
"<li><a href=\"c07.xhtml#s222\">Performing TF-IDF</a></li>\n"
"<li><a href=\"c07.xhtml#s223\">Model building</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c07.xhtml#s224\">Linear regression</a></li>\n"
"<li><a href=\"c07.xhtml#s225\">Increasing the dimensionality</a></li></ol></li>\n"
"<li><a href=\"c07.xhtml#s226\">Activation functions</a></li>\n"
"<li><a href=\"c07.xhtml#s227\">Model fitting</a></li>\n"
"<li><a href=\"c07.xhtml#s228\">Cross – validation</a></li></ol></li>\n"
"<li><a href=\"c07.xhtml#s229\">Neural networks with word2vec:</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c07.xhtml#s230\">Data splitting</a></li>\n"
"<li><a href=\"c07.xhtml#s231\">Creating a Word2Vec model</a></li>\n"
"<li><a href=\"c07.xhtml#s232\">Word2Vec model fitting</a></li>\n"
"<li><a href=\"c07.xhtml#s233\">Creating word vectors</a></li>\n"
"<li><a href=\"c07.xhtml#s234\">Padding sequences</a></li>\n"
"<li><a href=\"c07.xhtml#s235\">ANN model building</a></li>\n"
"<li><a href=\"c07.xhtml#s236\">Model fitting</a></li>\n"
"<li><a href=\"c07.xhtml#s237\">Cross-validation</a></li>\n"
"<li><a href=\"c07.xhtml#s238\">Sentiment analysis using LSTM</a></li>\n"
"<li><a href=\"c07.xhtml#s239\">Importing the dataset</a></li>\n"
"<li><a href=\"c07.xhtml#s240\">Pre-processings</a></li>\n"
"<li><a href=\"c07.xhtml#s241\">Data splitting and padding</a></li>\n"
"<li><a href=\"c07.xhtml#s242\">LSTM model building</a></li>\n"
"<li><a href=\"c07.xhtml#s243\">Cross-validation</a></li>\n"
"<li><a href=\"c07.xhtml#s244\">Comparison of results</a></li></ol></li>\n"
"<li><a href=\"c07.xhtml#s245\">Conclusion</a></li>\n"
"<li><a href=\"c07.xhtml#s246\">Questions</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c07.xhtml#s247\">Answers</a></li></ol></li></ol></li>\n"
"<li><a href=\"c08.xhtml\">8. Recommendation Engine</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c08.xhtml#s248\">Introduction</a></li>\n"
"<li><a href=\"c08.xhtml#s249\">Structure</a></li>\n"
"<li><a href=\"c08.xhtml#s250\">Objective</a></li>\n"
"<li><a href=\"c08.xhtml#s251\">Applications</a></li>\n"
"<li><a href=\"c08.xhtml#s252\">Classification of a recommendation system</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c08.xhtml#s253\">Simple rule-based recommenders</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c08.xhtml#s254\">About the dataset</a></li>\n"
"<li><a href=\"c08.xhtml#s255\">Installing and loading necessary libraries</a></li>\n"
"<li><a href=\"c08.xhtml#s256\">Importing the dataset</a></li>\n"
"<li><a href=\"c08.xhtml#s257\">Building a simple rule-based recommendation system</a></li>\n"
"<li><a href=\"c08.xhtml#s258\">Weighted ratings calculation</a></li>\n"
"<li><a href=\"c08.xhtml#s259\">Applying the calculation on the filtered records</a></li></ol></li>\n"
"<li><a href=\"c08.xhtml#s260\">Content based</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c08.xhtml#s261\">Using document similarity</a></li>\n"
"<li><a href=\"c08.xhtml#s262\">About the dataset</a></li>\n"
"<li><a href=\"c08.xhtml#s263\">Installing and loading necessary libraries</a></li>\n"
"<li><a href=\"c08.xhtml#s264\">Importing the dataset</a></li>\n"
"<li><a href=\"c08.xhtml#s265\">Some pre-processing</a></li></ol></li></ol></li>\n"
"<li><a href=\"c08.xhtml#s266\">Extract TF-IDF features</a></li>\n"
"<li><a href=\"c08.xhtml#s267\">Computing pairwise document similarity</a></li>\n"
"<li><a href=\"c08.xhtml#s268\">Building a movie recommender</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c08.xhtml#s269\">Using word embedding</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c08.xhtml#s270\">FastText</a></li></ol></li>\n"
"<li><a href=\"c08.xhtml#s271\">Generate document-level embeddings</a></li>\n"
"<li><a href=\"c08.xhtml#s272\">Collaborative-based</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c08.xhtml#s273\">User-based</a></li>\n"
"<li><a href=\"c08.xhtml#s274\">About the dataset</a></li>\n"
"<li><a href=\"c08.xhtml#s275\">Installing and loading necessary libraries</a></li>\n"
"<li><a href=\"c08.xhtml#s276\">Importing the dataset</a></li></ol></li></ol></li>\n"
"<li><a href=\"c08.xhtml#s277\">Advantages of a recommendation system</a></li>\n"
"<li><a href=\"c08.xhtml#s278\">Conclusion</a></li>\n"
"<li><a href=\"c08.xhtml#s279\">Questions</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c08.xhtml#s280\">Answers</a></li></ol></li></ol></li>\n"
"<li><a href=\"c09.xhtml\">9. Machine Translation</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c09.xhtml#s281\">Introduction</a></li>\n"
"<li><a href=\"c09.xhtml#s282\">Structure</a></li>\n"
"<li><a href=\"c09.xhtml#s283\">Objectives</a></li>\n"
"<li><a href=\"c09.xhtml#s284\">Application</a></li>\n"
"<li><a href=\"c09.xhtml#s285\">Types of MT</a></li>\n"
"<li><a href=\"c09.xhtml#s286\">Readily available libraries</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c09.xhtml#s287\">TextBlob</a></li>\n"
"<li><a href=\"c09.xhtml#s288\">LangDetect</a></li>\n"
"<li><a href=\"c09.xhtml#s289\">Fasttext</a></li></ol></li>\n"
"<li><a href=\"c09.xhtml#s290\">Sequence-to-sequence modeling</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c09.xhtml#s291\">About the dataset</a></li>\n"
"<li><a href=\"c09.xhtml#s292\">Installing and loading necessary libraries:</a></li>\n"
"<li><a href=\"c09.xhtml#s293\">Importing the dataset</a></li>\n"
"<li><a href=\"c09.xhtml#s294\">Preprocessing</a></li></ol></li>\n"
"<li><a href=\"c09.xhtml#s295\">Model building (using LSTM)</a></li>\n"
"<li><a href=\"c09.xhtml#s296\">Conclusion</a></li>\n"
"<li><a href=\"c09.xhtml#s297\">Exercise</a></li>\n"
"<li><a href=\"c09.xhtml#s298\">Questions</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c09.xhtml#s299\">Answers</a></li></ol></li></ol></li>\n"
"<li><a href=\"c10.xhtml\">10. Transfer Learning</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c10.xhtml#s300\">Introduction</a></li>\n"
"<li><a href=\"c10.xhtml#s301\">Structure</a></li>\n"
"<li><a href=\"c10.xhtml#s302\">Objectives</a></li>\n"
"<li><a href=\"c10.xhtml#s303\">Universal Sentence Encoder</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c10.xhtml#s304\">Goal</a></li></ol></li>\n"
"<li><a href=\"c10.xhtml#s305\">What is a transformer and do we need it?</a></li>\n"
"<li><a href=\"c10.xhtml#s306\">Deep Averaging Network (DAN)</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c10.xhtml#s307\">About the data</a></li>\n"
"<li><a href=\"c10.xhtml#s308\">Data pre-processing</a></li></ol></li>\n"
"<li><a href=\"c10.xhtml#s309\">Bidirectional Encoder Representation from Transformer (BERT)</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c10.xhtml#s310\">What is the necessity of BERT?</a></li>\n"
"<li><a href=\"c10.xhtml#s311\">The main idea behind BERT</a></li>\n"
"<li><a href=\"c10.xhtml#s312\">Why is BERT so powerful?</a></li>\n"
"<li><a href=\"c10.xhtml#s313\">BERT architecture</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c10.xhtml#s314\">Text processing</a></li>\n"
"<li><a href=\"c10.xhtml#s315\">Pre-training tasks</a></li></ol></li></ol></li>\n"
"<li><a href=\"c10.xhtml#s316\">Fine tuning</a></li>\n"
"<li><a href=\"c10.xhtml#s317\">Drawbacks</a></li>\n"
"<li><a href=\"c10.xhtml#s318\">Conclusion</a></li>\n"
"<li><a href=\"c10.xhtml#s319\">Multiple choice questions</a>\n"
"<ol epub:type=\"list\">\n"
"<li><a href=\"c10.xhtml#s320\">Answers</a></li></ol></li>\n"
"<li><a href=\"c10.xhtml#s321\">Project</a></li></ol></li>\n"
"<li><a href=\"ind.xhtml\">Index</a></li>\n"
"</ol>\n"
"</nav>\n"
"<nav epub:type=\"landmarks\">\n"
"<h3>Guide</h3>\n"
"<ol epub:type=\"list\">\n"
"<li><a epub:type=\"titlepage\" href=\"tp.xhtml\">Title Page</a></li>\n"
"<li><a epub:type=\"copyright-page\" href=\"cop.xhtml\">Copyright Page</a></li>\n"
"<li><a epub:type=\"toc\" href=\"toc.xhtml\">Table of Contents</a></li>\n"
"<li><a epub:type=\"bodymatter\" href=\"c01.xhtml\">1. Basic Text Processing Techniques</a></li>\n"
"</ol>\n"
"</nav>\n"
"</body>\n"
"</html>")
# Scan the sample document and print every match, followed by the span and
# text of each of its capture groups.
# re.MULTILINE is kept as generated; the pattern contains no ^ or $ anchors,
# so the flag does not change what is matched.
matches = re.finditer(regex, test_str, re.MULTILINE)
for match_num, match in enumerate(matches, start=1):
    print("Match {matchNum} was found at {start}-{end}: {match}".format(
        matchNum=match_num,
        start=match.start(),
        end=match.end(),
        match=match.group(),
    ))
    # Group 0 is the whole match (printed above), so numbering starts at 1.
    for group_num in range(1, len(match.groups()) + 1):
        # A group that took no part in the match reports -1 for start/end
        # and None for its text.
        print("Group {groupNum} found at {start}-{end}: {group}".format(
            groupNum=group_num,
            start=match.start(group_num),
            end=match.end(group_num),
            group=match.group(group_num),
        ))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
# Please keep in mind that these code samples are automatically generated and
# are not guaranteed to work. If you find any syntax errors, feel free to
# submit a bug report. For a full regex reference for Python, please visit:
# https://docs.python.org/3/library/re.html