% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tp-generator.R
\name{TPGenerator}
\alias{TPGenerator}
\title{Generates transition probabilities for n-grams}
\description{
It provides a method for generating transition probabilities for
the given n-gram size. It also provides a method for generating the combined
transition probabilities data for n-gram sizes from 1 to the given size. The
combined transition probabilities data can be used to implement back-off.
}
\details{
It provides a method for generating n-gram transition probabilities.
It reads n-gram frequencies from an input text file that is generated by the
TokenGenerator class.

It parses each n-gram into a prefix, a next word, the next word frequency and
the next word probability. Maximum likelihood estimation is used to generate
the next word probabilities.

Each n-gram prefix is converted to a numeric hash using the digest2int
function. The next word is replaced with the position of the next word in the
list of all words. The transition probabilities data is stored as a data frame
in a file.

Another method is provided that combines the transition probabilities for
n-grams of size 1 to the given size. The combined transition probabilities
can be saved to a file as a data frame. This file may be regarded as a
complete, self-contained n-gram model. By combining the transition
probabilities of n-grams, back-off may be used to evaluate word probabilities
or predict the next word.
}
\examples{

## ------------------------------------------------
## Method `TPGenerator$generate_tp`
## ------------------------------------------------

# Start of environment setup code
# The level of detail in the information messages
ve <- 0
# The name of the folder that will contain all the files. It will be
# created in the current directory. NULL implies tempdir will be used
fn <- NULL
# The required files. They are default files that are part of the
# package
rf <- c("n1.RDS", "n2.RDS", "n3.RDS", "n4.RDS")
# An object of class EnvManager is created
em <- EnvManager$new(ve = ve, rp = "./")
# The required files are downloaded
ed <- em$setup_env(rf, fn)
# End of environment setup code

# The list of output files
fns <- c("words", "model-4", "tp2", "tp3", "tp4")

# The TPGenerator object is created
tp <- TPGenerator$new(opts = list(n = 4, dir = ed), ve = ve)
# The combined transition probabilities are generated
tp$generate_tp()

# The test environment is removed. Comment out the line below so that
# the files generated by the function can be viewed
em$td_env()
}
\section{Super class}{
\code{\link[wordpredictor:Base]{wordpredictor::Base}} -> \code{TPGenerator}
}
\section{Methods}{
\subsection{Public methods}{
\itemize{
\item \href{#method-TPGenerator-new}{\code{TPGenerator$new()}}
\item \href{#method-TPGenerator-generate_tp}{\code{TPGenerator$generate_tp()}}
\item \href{#method-TPGenerator-generate_tp_for_n}{\code{TPGenerator$generate_tp_for_n()}}
\item \href{#method-TPGenerator-clone}{\code{TPGenerator$clone()}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-TPGenerator-new"></a>}}
\if{latex}{\out{\hypertarget{method-TPGenerator-new}{}}}
\subsection{Method \code{new()}}{
It initializes the current object. It is used to set the
transition probabilities options and the verbose option.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{TPGenerator$new(opts = list(), ve = 0)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{opts}}{The options for generating the transition probabilities.
\itemize{
\item \strong{save_tp}. If the data should be saved.
\item \strong{n}. The n-gram size.
\item \strong{dir}. The directory containing the input and output files.
\item \strong{format}. The format for the output. There are two options.
\itemize{
\item \strong{plain}. The data is stored in plain text.
\item \strong{obj}. The data is stored as an R object.
}
}}

\item{\code{ve}}{The level of detail in the information messages.}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-TPGenerator-generate_tp"></a>}}
\if{latex}{\out{\hypertarget{method-TPGenerator-generate_tp}{}}}
\subsection{Method \code{generate_tp()}}{
It first generates the transition probabilities for each
n-gram size from 1 to the given size. The transition probabilities
are then combined into a single data frame and saved to the output
folder that is given as a parameter to the current object.

By combining the transition probabilities for all n-gram sizes from 1
to n, back-off can be used to calculate next word probabilities or
predict the next word.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{TPGenerator$generate_tp()}\if{html}{\out{</div>}}
}

\subsection{Examples}{
\if{html}{\out{<div class="r example copy">}}
\preformatted{# Start of environment setup code
# The level of detail in the information messages
ve <- 0
# The name of the folder that will contain all the files. It will be
# created in the current directory. NULL implies tempdir will be used
fn <- NULL
# The required files. They are default files that are part of the
# package
rf <- c("n1.RDS", "n2.RDS", "n3.RDS", "n4.RDS")
# An object of class EnvManager is created
em <- EnvManager$new(ve = ve, rp = "./")
# The required files are downloaded
ed <- em$setup_env(rf, fn)
# End of environment setup code

# The list of output files
fns <- c("words", "model-4", "tp2", "tp3", "tp4")

# The TPGenerator object is created
tp <- TPGenerator$new(opts = list(n = 4, dir = ed), ve = ve)
# The combined transition probabilities are generated
tp$generate_tp()

# The test environment is removed. Comment out the line below so that
# the files generated by the function can be viewed
em$td_env()
}
\if{html}{\out{</div>}}

}

}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-TPGenerator-generate_tp_for_n"></a>}}
\if{latex}{\out{\hypertarget{method-TPGenerator-generate_tp_for_n}{}}}
\subsection{Method \code{generate_tp_for_n()}}{
It generates the transition probabilities table for the
given n-gram size. It first reads n-gram token frequencies from an
input text file.

It then generates a data frame whose columns are the
n-gram prefix, next word and next word frequency. The data frame may
be saved to a file as plain text or as an R object. If n = 1, then the
list of words is saved.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{TPGenerator$generate_tp_for_n(n)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{n}}{The n-gram size for which the transition probabilities data is generated.}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-TPGenerator-clone"></a>}}
\if{latex}{\out{\hypertarget{method-TPGenerator-clone}{}}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{TPGenerator$clone(deep = FALSE)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{deep}}{Whether to make a deep clone.}
}
\if{html}{\out{</div>}}
}
}
}
