use strict;
use warnings;
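# Test input: an EPUB 3 navigation document (an XHTML table of contents),
# stored verbatim as one multi-line single-quoted string.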
my $str = '<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html>
<html lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
<title>Python Text Mining: Perform Text Processing, Word Embedding, Text Classification and Machine Translation</title>
<meta http-equiv="default-style" content="application/xhtml+xml; charset=utf-8" />
<style>ol{list-style-type:none;}a{text-decoration:none;}</style>
</head>
<body>
<nav id="toc" epub:type="toc">
<h1>Table of Contents</h1>
<div>jkas<div>fbksafbjksa</div>dfdfdfd</div>
<ol epub:type="list">
<li><a href="cvi.xhtml#cvi">Cover Page</a></li>
<li><a href="tp.xhtml#s1">Title
Page</a></li>
<li><a href="cop.xhtml">Copyright Page</a></li>
<li><a href="ded.xhtml">Dedication Page</a></li>
<li><a href="ata.xhtml">About the Author</a></li>
<li><a href="fm.xhtml">About the Reviewer</a></li>
<li><a href="ack.xhtml">Acknowledgement</a></li>
<li><a href="pre.xhtml">Preface</a></li>
<li><a href="fm1.xhtml">Errata</a></li>
<li><a href="toc.xhtml">Table of Contents</a></li>
<li><a href="c01.xhtml">1. Basic Text Processing Techniques</a>
<ol epub:type="list">
<li><a href="c01.xhtml#s1">Introduction</a></li>
<li><a href="c01.xhtml#s2">Structure</a></li>
<li><a href="c01.xhtml#s3">Objectives</a></li>
<li><a href="c01.xhtml#s4">Data preparation</a></li>
<li><a href="c01.xhtml#s5">Project 1: Twitter data analysis</a>
<ol epub:type="list">
<li><a href="c01.xhtml#s6">Scraping the data</a></li>
<li><a href="c01.xhtml#s7">Data pre-processing</a></li>
<li><a href="c01.xhtml#s8">Importing necessary packages</a></li>
<li><a href="c01.xhtml#s9">HTML parsing</a></li>
<li><a href="c01.xhtml#s10">Removing accented characters</a></li>
<li><a href="c01.xhtml#s11">Expanding contractions</a></li>
<li><a href="c01.xhtml#s12">Lemmatization and stemming</a>
<ol epub:type="list">
<li><a href="c01.xhtml#s13">Fail case</a></li></ol></li>
<li><a href="c01.xhtml#s14">Removing special characters</a></li>
<li><a href="c01.xhtml#s15">Removing stop words</a></li>
<li><a href="c01.xhtml#s16">Handling emojis or emoticons</a></li>
<li><a href="c01.xhtml#s17">Emoji removal</a></li>
<li><a href="c01.xhtml#s18">Text acronym abbreviation</a></li>
<li><a href="c01.xhtml#s19">Twitter data processing</a></li>
<li><a href="c01.xhtml#s20">Extracting usertags and hashtags</a></li></ol></li>
<li><a href="c01.xhtml#s21">Project 2: In-shots data pre-processing</a>
<ol epub:type="list">
<li><a href="c01.xhtml#s22">Importing the necessary packages</a></li>
<li><a href="c01.xhtml#s23">Setting the urls for data extraction</a></li>
<li><a href="c01.xhtml#s24">Function to scrape data from the urls</a></li>
<li><a href="c01.xhtml#s25">Importing packages</a></li></ol></li>
<li><a href="c01.xhtml#s26">Conclusion</a></li>
<li><a href="c01.xhtml#s27">Questions</a></li>
<li><a href="c01.xhtml#s28">Multiple choice questions</a>
<ol epub:type="list">
<li><a href="c01.xhtml#s29">Answers</a></li></ol></li></ol></li>
<li><a href="c02.xhtml">2. Text to Numbers</a>
<ol epub:type="list">
<li><a href="c02.xhtml#s30">Introduction</a></li>
<li><a href="c02.xhtml#s31">Structure</a></li>
<li><a href="c02.xhtml#s32">Objectives</a></li>
<li><a href="c02.xhtml#s33">Feature encoding or engineering</a></li>
<li><a href="c02.xhtml#s34">One-hot encoding</a>
<ol epub:type="list">
<li><a href="c02.xhtml#s35">Corpus</a></li>
<li><a href="c02.xhtml#s36">Code</a></li>
<li><a href="c02.xhtml#s37">Creating the text corpus</a></li>
<li><a href="c02.xhtml#s38">Some basic pre-processings</a>
<ol epub:type="list">
<li><a href="c02.xhtml#s39">Min_df</a></li>
<li><a href="c02.xhtml#s40">Max_df</a></li></ol></li>
<li><a href="c02.xhtml#s41">Limitations</a></li></ol></li>
<li><a href="c02.xhtml#s42">Bag of words</a>
<ol epub:type="list">
<li><a href="c02.xhtml#s43">Code</a></li>
<li><a href="c02.xhtml#s44">Performing bag-of-words using sklearn</a></li>
<li><a href="c02.xhtml#s45">Difference between one-hot encoding and bag of words</a></li>
<li><a href="c02.xhtml#s46">Limitations</a></li></ol></li>
<li><a href="c02.xhtml#s47">N-gram model</a>
<ol epub:type="list">
<li><a href="c02.xhtml#s48">Limitations</a></li></ol></li>
<li><a href="c02.xhtml#s49">TF-IDF</a>
<ol epub:type="list">
<li><a href="c02.xhtml#s50">Code</a></li>
<li><a href="c02.xhtml#s51">Performing TF-IDF using sklearn</a></li></ol></li>
<li><a href="c02.xhtml#s52">Project -1</a></li>
<li><a href="c02.xhtml#s53">Solution</a>
<ol epub:type="list">
<li><a href="c02.xhtml#s54">Loading the dataset</a></li>
<li><a href="c02.xhtml#s55">Some basic pre-processings</a></li>
<li><a href="c02.xhtml#s56">One-hot encoding</a></li>
<li><a href="c02.xhtml#s57">Bag of words</a></li>
<li><a href="c02.xhtml#s58">Bag of N-grams model</a></li></ol></li>
<li><a href="c02.xhtml#s59">Project -2</a>
<ol epub:type="list">
<li><a href="c02.xhtml#s60">Loading the dataset</a></li>
<li><a href="c02.xhtml#s61">Some basic pre-processings</a></li>
<li><a href="c02.xhtml#s62">TF-IDF</a></li>
<li><a href="c02.xhtml#s63">Comparison of One-Hot, BOW, and TF-IDF</a></li></ol></li>
<li><a href="c02.xhtml#s64">Conclusion</a></li>
<li><a href="c02.xhtml#s65">Questions</a></li>
<li><a href="c02.xhtml#s66">Multiple choice questions</a>
<ol epub:type="list">
<li><a href="c02.xhtml#s67">Answers</a></li></ol></li></ol></li>
<li><a href="c03.xhtml">3. Word Embeddings</a>
<ol epub:type="list">
<li><a href="c03.xhtml#s68">Introduction</a></li>
<li><a href="c03.xhtml#s69">Structure</a></li>
<li><a href="c03.xhtml#s70">Objective</a></li>
<li><a href="c03.xhtml#s71">Word vectors or word embeddings</a>
<ol epub:type="list">
<li><a href="c03.xhtml#s72">Difference between word embeddings and TF-IDF</a></li></ol></li>
<li><a href="c03.xhtml#s73">Feature engineering with word embeddings</a></li>
<li><a href="c03.xhtml#s74">Word2Vec</a>
<ol epub:type="list">
<li><a href="c03.xhtml#s75">Code</a></li>
<li><a href="c03.xhtml#s76">t-SNE</a></li>
<li><a href="c03.xhtml#s77">Word similarity dataframe</a></li></ol></li>
<li><a href="c03.xhtml#s78">Global Vector (GloVe) Model</a>
<ol epub:type="list">
<li><a href="c03.xhtml#s79">The GloVe Model using Spacy</a></li>
<li><a href="c03.xhtml#s80">Loading the downloaded vector model</a></li>
<li><a href="c03.xhtml#s81">Word vector dataframe</a></li>
<li><a href="c03.xhtml#s82">t-SNE visualization</a></li>
<li><a href="c03.xhtml#s83">Word similarity dataframe</a></li></ol></li>
<li><a href="c03.xhtml#s84">fastText</a>
<ol epub:type="list">
<li><a href="c03.xhtml#s85">fastText using Gensim</a></li>
<li><a href="c03.xhtml#s86">t-SNE visualization</a></li>
<li><a href="c03.xhtml#s87">Finding Odd word out using FastText</a></li></ol></li>
<li><a href="c03.xhtml#s88">Difference between Word2Vec, GloVe, and FastText</a></li>
<li><a href="c03.xhtml#s89">Using pre-trained word embeddings</a>
<ol epub:type="list">
<li><a href="c03.xhtml#s90">Importing necessary libraries</a></li>
<li><a href="c03.xhtml#s91">Loading the Word2Vec model</a></li>
<li><a href="c03.xhtml#s92">Sample data initialization</a></li>
<li><a href="c03.xhtml#s93">Pre-processings and word tokenizations</a></li>
<li><a href="c03.xhtml#s94">Extracting list of unique words</a></li>
<li><a href="c03.xhtml#s95">t-SNE visualization</a></li></ol></li>
<li><a href="c03.xhtml#s96">Project</a>
<ol epub:type="list">
<li><a href="c03.xhtml#s97">Solution</a></li>
<li><a href="c03.xhtml#s98">Importing necessary libraries</a></li>
<li><a href="c03.xhtml#s99">Loading the Word2Vec model</a></li>
<li><a href="c03.xhtml#s100">Scraping data from inshots</a></li>
<li><a href="c03.xhtml#s101">Pre-processings and word tokenizations</a></li>
<li><a href="c03.xhtml#s102">Extracting list of unique words</a></li>
<li><a href="c03.xhtml#s103">Removing words not in vocab</a></li>
<li><a href="c03.xhtml#s104">t-SNE visualization</a></li></ol></li>
<li><a href="c03.xhtml#s105">Conclusion</a>
<ol epub:type="list">
<li><a href="c03.xhtml#s106">Project</a></li></ol></li></ol></li>
<li><a href="c04.xhtml">4. Topic Modeling</a>
<ol epub:type="list">
<li><a href="c04.xhtml#s107">Introduction</a></li>
<li><a href="c04.xhtml#s108">Structure</a></li>
<li><a href="c04.xhtml#s109">Objectives</a></li>
<li><a href="c04.xhtml#s110">Topic modeling</a>
<ol epub:type="list">
<li><a href="c04.xhtml#s111">Identity matrix</a></li>
<li><a href="c04.xhtml#s112">Unitary matrix</a></li>
<li><a href="c04.xhtml#s113">Eigen values and Eigen vectors</a></li>
<li><a href="c04.xhtml#s114">Singular value decomposition</a></li>
<li><a href="c04.xhtml#s115">Latent semantic indexing</a></li>
<li><a href="c04.xhtml#s116">TF-IDF vectorization</a></li>
<li><a href="c04.xhtml#s117">Building an SVD model</a></li>
<li><a href="c04.xhtml#s118">Looking at the topics and the words contributing to the topic</a></li>
<li><a href="c04.xhtml#s119">Advantages and disadvantages of LSI</a></li></ol></li>
<li><a href="c04.xhtml#s120">Latent Dirichlet Allocation</a>
<ol epub:type="list">
<li><a href="c04.xhtml#s121">Introduction</a></li>
<li><a href="c04.xhtml#s122">Working</a></li>
<li><a href="c04.xhtml#s123">About the data</a></li>
<li><a href="c04.xhtml#s124">Some pre-processing</a></li>
<li><a href="c04.xhtml#s125">Looking at the top 20 frequently used words</a></li>
<li><a href="c04.xhtml#s126">Some EDA</a></li>
<li><a href="c04.xhtml#s127">Generating Bi-grams (BoW)</a></li>
<li><a href="c04.xhtml#s128">LDA model fitting</a></li>
<li><a href="c04.xhtml#s129">LDA using Gensim and its visualization</a></li>
<li><a href="c04.xhtml#s130">Importing the data</a></li>
<li><a href="c04.xhtml#s131">Some pre-processing</a></li>
<li><a href="c04.xhtml#s132">Extending stop words and building ngram models</a></li>
<li><a href="c04.xhtml#s133">Creating term document frequency and the LDA model</a></li>
<li><a href="c04.xhtml#s134">Dominant topic identification</a></li>
<li><a href="c04.xhtml#s135">PyLDAvis</a></li>
<li><a href="c04.xhtml#s136">Disadvantages of LDA</a></li></ol></li>
<li><a href="c04.xhtml#s137">Non-Negative Matrix Factorization (NMF)</a>
<ol epub:type="list">
<li><a href="c04.xhtml#s138">Importing necessary libraries</a></li>
<li><a href="c04.xhtml#s139">Some pre-processing</a></li>
<li><a href="c04.xhtml#s140">Looking at the top 20 frequently used words</a></li>
<li><a href="c04.xhtml#s141">Some EDA</a></li>
<li><a href="c04.xhtml#s142">Generating Bi-grams (BoW)</a></li>
<li><a href="c04.xhtml#s143">Building TF-IDF vectorizer</a></li>
<li><a href="c04.xhtml#s144">Visualizing ranks with the TF-IDF weights</a></li>
<li><a href="c04.xhtml#s145">NMF modelling</a></li>
<li><a href="c04.xhtml#s146">Disadvantages of NMF</a></li></ol></li>
<li><a href="c04.xhtml#s147">Conclusion</a></li>
<li><a href="c04.xhtml#s148">Questions</a>
<ol epub:type="list">
<li><a href="c04.xhtml#s149">Answers</a></li></ol></li>
<li><a href="c04.xhtml#s150">Projects</a></li></ol></li>
<li><a href="c05.xhtml">5. Unsupervised Sentiment Classification</a>
<ol epub:type="list">
<li><a href="c05.xhtml#s151">Introduction</a></li>
<li><a href="c05.xhtml#s152">Structure</a></li>
<li><a href="c05.xhtml#s153">Objective</a></li>
<li><a href="c05.xhtml#s154">Lexicon-based approach</a>
<ol epub:type="list">
<li><a href="c05.xhtml#s155">About the dataset</a></li>
<li><a href="c05.xhtml#s156">Loading necessary libraries</a></li>
<li><a href="c05.xhtml#s157">Importing the dataset</a></li>
<li><a href="c05.xhtml#s158">Some pre-processings</a></li>
<li><a href="c05.xhtml#s159">Defining a function to perform the following</a></li></ol></li>
<li><a href="c05.xhtml#s160">Opinion lexicon</a>
<ol epub:type="list">
<li><a href="c05.xhtml#s161">Importing the opinion lexicon</a></li>
<li><a href="c05.xhtml#s162">Tokenize the reviews into a sentence and form the sentence and review the ID</a></li>
<li><a href="c05.xhtml#s163">Sentiment classification</a></li>
<li><a href="c05.xhtml#s164">Converting the sentiments to a review level</a></li>
<li><a href="c05.xhtml#s165">Converting the sentiment codes from the dataset to sentiments</a></li></ol></li>
<li><a href="c05.xhtml#s166">Senti WordNet lexicon</a>
<ol epub:type="list">
<li><a href="c05.xhtml#s167">Function to perform SentiWordNet</a></li>
<li><a href="c05.xhtml#s168">Sentiment classification</a></li>
<li><a href="c05.xhtml#s169">Evaluation</a></li></ol></li>
<li><a href="c05.xhtml#s170">TextBlob</a>
<ol epub:type="list">
<li><a href="c05.xhtml#s171">Importing libraries</a></li>
<li><a href="c05.xhtml#s172">Predicting a sentiment of sample reviews</a></li>
<li><a href="c05.xhtml#s173">Prediction and evaluation</a></li></ol></li>
<li><a href="c05.xhtml#s174">AFINN</a>
<ol epub:type="list">
<li><a href="c05.xhtml#s175">Importing necessary libraries</a></li>
<li><a href="c05.xhtml#s176">Sentiment classification and evaluation</a></li></ol></li>
<li><a href="c05.xhtml#s177">VADER</a>
<ol epub:type="list">
<li><a href="c05.xhtml#s178">Importing necessary libraries</a></li>
<li><a href="c05.xhtml#s179">Sentiment classification and evaluation</a></li>
<li><a href="c05.xhtml#s180">Sample prediction</a></li>
<li><a href="c05.xhtml#s181">Drawbacks of lexicon-based sentiment classification</a></li></ol></li>
<li><a href="c05.xhtml#s182">Conclusion</a></li>
<li><a href="c05.xhtml#s183">Questions</a>
<ol epub:type="list">
<li><a href="c05.xhtml#s184">Answers</a></li></ol></li></ol></li>
<li><a href="c06.xhtml">6. Text Classification Using ML</a>
<ol epub:type="list">
<li><a href="c06.xhtml#s185">Introduction</a></li>
<li><a href="c06.xhtml#s186">Structure</a></li>
<li><a href="c06.xhtml#s187">Objectives</a></li>
<li><a href="c06.xhtml#s188">Supervised learning</a>
<ol epub:type="list">
<li><a href="c06.xhtml#s189">About the dataset</a></li>
<li><a href="c06.xhtml#s190">Loading the necessary libraries</a></li>
<li><a href="c06.xhtml#s191">Importing the dataset</a></li>
<li><a href="c06.xhtml#s192">Pre-processings</a></li>
<li><a href="c06.xhtml#s193">Performing TF-IDF</a></li></ol></li>
<li><a href="c06.xhtml#s194">Model fitting</a>
<ol epub:type="list">
<li><a href="c06.xhtml#s195">Logistic regression</a></li>
<li><a href="c06.xhtml#s196">Lasso regularization</a></li>
<li><a href="c06.xhtml#s197">Ridge regularization</a></li>
<li><a href="c06.xhtml#s198">Elastic-net classifier</a></li>
<li><a href="c06.xhtml#s199">Naïve Bayes algorithm</a></li>
<li><a href="c06.xhtml#s200">K – Nearest Neighbors</a></li>
<li><a href="c06.xhtml#s201">Decision tree</a></li>
<li><a href="c06.xhtml#s202">Random forest</a></li>
<li><a href="c06.xhtml#s203">Ada Boost</a></li>
<li><a href="c06.xhtml#s204">Gradient boosting machine</a></li>
<li><a href="c06.xhtml#s205">XG-Boost</a></li></ol></li>
<li><a href="c06.xhtml#s206">Grid Search</a></li>
<li><a href="c06.xhtml#s207">Conclusion</a></li>
<li><a href="c06.xhtml#s208">Questions</a>
<ol epub:type="list">
<li><a href="c06.xhtml#s209">Answers</a></li></ol></li>
<li><a href="c06.xhtml#s210">Project</a></li></ol></li>
<li><a href="c07.xhtml">7. Text Classification Using Deep Learning</a>
<ol epub:type="list">
<li><a href="c07.xhtml#s211">Introduction</a></li>
<li><a href="c07.xhtml#s212">Structure</a></li>
<li><a href="c07.xhtml#s213">Objectives</a>
<ol epub:type="list">
<li><a href="c07.xhtml#s214">Learning about the Neural Networks</a></li></ol></li>
<li><a href="c07.xhtml#s215">Neural networks for sentiment classification</a></li>
<li><a href="c07.xhtml#s216">Neural networks with TF-IDF</a>
<ol epub:type="list">
<li><a href="c07.xhtml#s217">Installing libraries</a></li>
<li><a href="c07.xhtml#s218">Importing libraries</a></li>
<li><a href="c07.xhtml#s219">Importing the dataset</a></li>
<li><a href="c07.xhtml#s220">Pre-processings</a></li>
<li><a href="c07.xhtml#s221">Train, test, and validation set</a></li>
<li><a href="c07.xhtml#s222">Performing TF-IDF</a></li>
<li><a href="c07.xhtml#s223">Model building</a>
<ol epub:type="list">
<li><a href="c07.xhtml#s224">Linear regression</a></li>
<li><a href="c07.xhtml#s225">Increasing the dimensionality</a></li></ol></li>
<li><a href="c07.xhtml#s226">Activation functions</a></li>
<li><a href="c07.xhtml#s227">Model fitting</a></li>
<li><a href="c07.xhtml#s228">Cross – validation</a></li></ol></li>
<li><a href="c07.xhtml#s229">Neural networks with word2vec:</a>
<ol epub:type="list">
<li><a href="c07.xhtml#s230">Data splitting</a></li>
<li><a href="c07.xhtml#s231">Creating a Word2Vec model</a></li>
<li><a href="c07.xhtml#s232">Word2Vec model fitting</a></li>
<li><a href="c07.xhtml#s233">Creating word vectors</a></li>
<li><a href="c07.xhtml#s234">Padding sequences</a></li>
<li><a href="c07.xhtml#s235">ANN model building</a></li>
<li><a href="c07.xhtml#s236">Model fitting</a></li>
<li><a href="c07.xhtml#s237">Cross-validation</a></li>
<li><a href="c07.xhtml#s238">Sentiment analysis using LSTM</a></li>
<li><a href="c07.xhtml#s239">Importing the dataset</a></li>
<li><a href="c07.xhtml#s240">Pre-processings</a></li>
<li><a href="c07.xhtml#s241">Data splitting and padding</a></li>
<li><a href="c07.xhtml#s242">LSTM model building</a></li>
<li><a href="c07.xhtml#s243">Cross-validation</a></li>
<li><a href="c07.xhtml#s244">Comparison of results</a></li></ol></li>
<li><a href="c07.xhtml#s245">Conclusion</a></li>
<li><a href="c07.xhtml#s246">Questions</a>
<ol epub:type="list">
<li><a href="c07.xhtml#s247">Answers</a></li></ol></li></ol></li>
<li><a href="c08.xhtml">8. Recommendation Engine</a>
<ol epub:type="list">
<li><a href="c08.xhtml#s248">Introduction</a></li>
<li><a href="c08.xhtml#s249">Structure</a></li>
<li><a href="c08.xhtml#s250">Objective</a></li>
<li><a href="c08.xhtml#s251">Applications</a></li>
<li><a href="c08.xhtml#s252">Classification of a recommendation system</a>
<ol epub:type="list">
<li><a href="c08.xhtml#s253">Simple rule-based recommenders</a>
<ol epub:type="list">
<li><a href="c08.xhtml#s254">About the dataset</a></li>
<li><a href="c08.xhtml#s255">Installing and loading necessary libraries</a></li>
<li><a href="c08.xhtml#s256">Importing the dataset</a></li>
<li><a href="c08.xhtml#s257">Building a simple rule-based recommendation system</a></li>
<li><a href="c08.xhtml#s258">Weighted ratings calculation</a></li>
<li><a href="c08.xhtml#s259">Applying the calculation on the filtered records</a></li></ol></li>
<li><a href="c08.xhtml#s260">Content based</a>
<ol epub:type="list">
<li><a href="c08.xhtml#s261">Using document similarity</a></li>
<li><a href="c08.xhtml#s262">About the dataset</a></li>
<li><a href="c08.xhtml#s263">Installing and loading necessary libraries</a></li>
<li><a href="c08.xhtml#s264">Importing the dataset</a></li>
<li><a href="c08.xhtml#s265">Some pre-processing</a></li></ol></li></ol></li>
<li><a href="c08.xhtml#s266">Extract TF-IDF features</a></li>
<li><a href="c08.xhtml#s267">Computing pairwise document similarity</a></li>
<li><a href="c08.xhtml#s268">Building a movie recommender</a>
<ol epub:type="list">
<li><a href="c08.xhtml#s269">Using word embedding</a>
<ol epub:type="list">
<li><a href="c08.xhtml#s270">FastText</a></li></ol></li>
<li><a href="c08.xhtml#s271">Generate document-level embeddings</a></li>
<li><a href="c08.xhtml#s272">Collaborative-based</a>
<ol epub:type="list">
<li><a href="c08.xhtml#s273">User-based</a></li>
<li><a href="c08.xhtml#s274">About the dataset</a></li>
<li><a href="c08.xhtml#s275">Installing and loading necessary libraries</a></li>
<li><a href="c08.xhtml#s276">Importing the dataset</a></li></ol></li></ol></li>
<li><a href="c08.xhtml#s277">Advantages of a recommendation system</a></li>
<li><a href="c08.xhtml#s278">Conclusion</a></li>
<li><a href="c08.xhtml#s279">Questions</a>
<ol epub:type="list">
<li><a href="c08.xhtml#s280">Answers</a></li></ol></li></ol></li>
<li><a href="c09.xhtml">9. Machine Translation</a>
<ol epub:type="list">
<li><a href="c09.xhtml#s281">Introduction</a></li>
<li><a href="c09.xhtml#s282">Structure</a></li>
<li><a href="c09.xhtml#s283">Objectives</a></li>
<li><a href="c09.xhtml#s284">Application</a></li>
<li><a href="c09.xhtml#s285">Types of MT</a></li>
<li><a href="c09.xhtml#s286">Readily available libraries</a>
<ol epub:type="list">
<li><a href="c09.xhtml#s287">TextBlob</a></li>
<li><a href="c09.xhtml#s288">LangDetect</a></li>
<li><a href="c09.xhtml#s289">Fasttext</a></li></ol></li>
<li><a href="c09.xhtml#s290">Sequence-to-sequence modeling</a>
<ol epub:type="list">
<li><a href="c09.xhtml#s291">About the dataset</a></li>
<li><a href="c09.xhtml#s292">Installing and loading necessary libraries:</a></li>
<li><a href="c09.xhtml#s293">Importing the dataset</a></li>
<li><a href="c09.xhtml#s294">Preprocessing</a></li></ol></li>
<li><a href="c09.xhtml#s295">Model building (using LSTM)</a></li>
<li><a href="c09.xhtml#s296">Conclusion</a></li>
<li><a href="c09.xhtml#s297">Exercise</a></li>
<li><a href="c09.xhtml#s298">Questions</a>
<ol epub:type="list">
<li><a href="c09.xhtml#s299">Answers</a></li></ol></li></ol></li>
<li><a href="c10.xhtml">10. Transfer Learning</a>
<ol epub:type="list">
<li><a href="c10.xhtml#s300">Introduction</a></li>
<li><a href="c10.xhtml#s301">Structure</a></li>
<li><a href="c10.xhtml#s302">Objectives</a></li>
<li><a href="c10.xhtml#s303">Universal Sentence Encoder</a>
<ol epub:type="list">
<li><a href="c10.xhtml#s304">Goal</a></li></ol></li>
<li><a href="c10.xhtml#s305">What is a transformer and do we need it?</a></li>
<li><a href="c10.xhtml#s306">Deep Averaging Network (DAN)</a>
<ol epub:type="list">
<li><a href="c10.xhtml#s307">About the data</a></li>
<li><a href="c10.xhtml#s308">Data pre-processing</a></li></ol></li>
<li><a href="c10.xhtml#s309">Bidirectional Encoder Representation from Transformer (BERT)</a>
<ol epub:type="list">
<li><a href="c10.xhtml#s310">What is the necessity of BERT?</a></li>
<li><a href="c10.xhtml#s311">The main idea behind BERT</a></li>
<li><a href="c10.xhtml#s312">Why is BERT so powerful?</a></li>
<li><a href="c10.xhtml#s313">BERT architecture</a>
<ol epub:type="list">
<li><a href="c10.xhtml#s314">Text processing</a></li>
<li><a href="c10.xhtml#s315">Pre-training tasks</a></li></ol></li></ol></li>
<li><a href="c10.xhtml#s316">Fine tuning</a></li>
<li><a href="c10.xhtml#s317">Drawbacks</a></li>
<li><a href="c10.xhtml#s318">Conclusion</a></li>
<li><a href="c10.xhtml#s319">Multiple choice questions</a>
<ol epub:type="list">
<li><a href="c10.xhtml#s320">Answers</a></li></ol></li>
<li><a href="c10.xhtml#s321">Project</a></li></ol></li>
<li><a href="ind.xhtml">Index</a></li>
</ol>
</nav>
<nav epub:type="landmarks">
<h3>Guide</h3>
<ol epub:type="list">
<li><a epub:type="titlepage" href="tp.xhtml">Title Page</a></li>
<li><a epub:type="copyright-page" href="cop.xhtml">Copyright Page</a></li>
<li><a epub:type="toc" href="toc.xhtml">Table of Contents</a></li>
<li><a epub:type="bodymatter" href="c01.xhtml">1. Basic Text Processing Techniques</a></li>
</ol>
</nav>
</body>
</html>';
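# The pattern below tries three alternatives, in order:
#   1. an opening tag from the listed set, then 3+ characters that do not
#      begin another block element, closed by the matching end tag
#      (back-referenced as (?P=tag)) or by a <figure>/<img> tag;
#   2. the same opening-tag form with no closer found, stopping lazily
#      just before the next '<' via lookahead;
#   3. bare text of 3+ non-'<' characters wedged between '>' and '</div>'
#      (matched with lookbehind/lookahead, so the surrounding tags are
#      not consumed).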
my $regex = qr/(((<(?P<tag>(title|table|li|p|h\d|td|pre|div|i|a|b|strong))\b[^>]*?>(((?!<(?P=tag)|<pre|<figure|<img|<h\d|<li\b)[\d\D])){3,}?)(<(\/(?P=tag)|figure|img)[^>]*?>))|((<(?P<tag2>(title|li|p|h\d|td|pre|div|i|a|b|strong))\b[^>]*?>(((?!<(?P=tag2)|<pre|<figure|<img|<h\d|<li\b)[\d\D])){3,}?)(?=<))|((?<=>)[^<]{3,}(?=<\/div>)))/mp;
if ( $str =~ /$regex/g ) {
print "Whole match is ${^MATCH} and its start/end positions can be obtained via \$-[0] and \$+[0]\n";
# print "Capture Group 1 is $1 and its start/end positions can be obtained via \$-[1] and \$+[1]\n";
# print "Capture Group 2 is $2 ... and so on\n";
}
# ${^POSTMATCH} and ${^PREMATCH} are also available with the use of '/p'
# Named capture groups can be called via $+{name}
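# A minimal sketch (an editorial addition, not generator output): with /g in
# a while loop, pos($str) advances past each match, so every match in the
# document is visited rather than only the first.
pos($str) = 0;    # rewind: the if-block above already advanced pos past match 1
while ( $str =~ /$regex/g ) {
    # %+ holds the named captures; only the name from the branch that
    # actually matched is defined.
    my $tag = $+{tag} // $+{tag2} // '(bare text before </div>)';
    printf "tag=%s, span=%d..%d\n", $tag, $-[0], $+[0];
}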
# Please keep in mind that these code samples are automatically generated
# and are not guaranteed to work. If you find any syntax errors, feel free
# to submit a bug report. For a full regex reference for Perl, please visit:
# http://perldoc.perl.org/perlre.html