tr-unpublished.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: bib2bib -ob tr-unpublished.bib -oc /dev/null -c 'type : "tr" or type : "unpublished" or type: "Ph.D. thesis"' publications.bib}}
@phdthesis{heeringa:phd2006,
  type = {Ph.{D}. thesis},
  author = {Heeringa, Brent},
  title = {Improving Access to Organized Information},
  url = {./thesis.pdf},
  school = {University of Massachusetts Amherst},
  year = 2006,
  abstract = {We introduce several new models and methods for improving access to
organized information.  The first model, Constrained Subtree Selection
(CSS), has applications in web site design and the reorganization of
directory structures.  Given a hierarchy represented as a rooted DAG
$G$ with $n$ weighted leaves, one selects a subtree of the transitive
closure of $G$ that minimizes the expected path cost.  Path cost is
the sum of the degree costs along a path from the root to a leaf.
Degree cost, $\gamma$, is a function of the out degree of a node.  We
give a sufficient condition for $\gamma$ that makes CSS \textbf{NP}-complete.
This result holds even when the leaves have equal weight.  Turning to
algorithms, we give a polynomial time solution for instances of CSS
where $G$ does not constrain the choice of subtrees and $\gamma$
favors nodes with at most $k$ links.  Even though CSS remains \textbf{NP}-hard
for constant degree DAGs, we give an $O(\log(k)\gamma(d+1))$
approximation for any $G$ with maximum degree $d$, provided that
$\gamma$ favors nodes with at most $k$ links.  Finally, we give a
complete characterization of the optimal trees for two special cases:
(1) linear degree cost in unconstrained graphs and uniform probability
distributions, and (2) logarithmic degree cost in arbitrary DAGs and
uniform probability distributions.

The second problem, Category Tree (CT), seeks a decision tree for
categorical data where internal nodes are categories, edges are
appropriate values for the categories, and leaves are data items.  CT
generalizes the well-studied Decision Tree (DT) problem.  Our results
resolve two open problems: We give a $\ln n +1$-approximation for DT
and show DT does not have a polynomial time approximation scheme
unless \textbf{P}=\textbf{NP}.  Our work, while providing the first non-trivial upper
and lower bounds on approximating DT, also demonstrates that DT and a
subtly different problem which also bears the name decision tree have
fundamentally different approximation complexity.

We complement the above models with a new pruning method for $k$
nearest neighbor queries on R-trees.  We show that an extension to a
popular depth-first 1-nearest neighbor query results in a
theoretically better search.  We call this extension Promise-Pruning
and construct a class of R-trees where its application reduces the
search space exponentially.}
}
@techreport{heeringa-adler:tr04-09,
  type = {tr},
  author = {Heeringa, Brent and Adler, Micah},
  title = {Optimal Website Design with the Constrained Subtree
           Selection Problem},
  url = {./04-09.pdf},
  year = {2004},
  institution = {University of Massachusetts Amherst},
  number = {04-09},
  abstract = {We introduce the Constrained Subtree Selection (CSS) problem as a
model for the optimal design of websites.  Given a hierarchy of topics
represented as a DAG $G$ and a probability distribution over the
topics, we select a subtree of the transitive closure of $G$ which
minimizes the expected path cost.  We define path cost as the sum of
the page costs along a path from the root to a leaf.  Page cost,
$\gamma$, is a function of the number of links on a page.  We give a
sufficient condition for $\gamma$ which makes CSS NP-complete.  This
result holds even for the uniform probability distribution.  We give a
polynomial time algorithm for instances of CSS where $G$ does not
constrain the choice of subtrees and $\gamma$ favors pages with at
most $k$ links.  We show that CSS remains NP-hard for constant degree
DAGs and a class of degree costs and give an $O(\log(k)\gamma(d+1))$
approximation for any $\gamma$ favoring pages with at most $k$ links
and for any $G$ with maximum degree $d$.  We give complete
characterizations of the optimal trees for the linear degree cost in
unconstrained graphs and uniform probability distributions, and the
logarithmic degree cost in arbitrary DAGs and uniform probability
distributions.}
}
@techreport{heeringa-oates:tr01-21,
  type = {tr},
  author = {Heeringa, Brent and Oates, Tim},
  title = {Incrementally Learning the Parameters of
           Stochastic Context-Free Grammars using Summary Statistics
           and Repeated Sampling},
  url = {./01-21.pdf},
  institution = {University of Massachusetts Amherst},
  number = {01-21},
  year = {2001},
  abstract = {We are interested in how robots might learn language from
exposure to utterances and sensory information about the physical
contexts in which they occur. Although stochastic context free
grammars are often used to represent the syntactic structure of
natural languages, most algorithms for learning them from data require
repeated processing of a corpus of sentences. The memory and
computational demands of such algorithms are ill-suited for embedded
agents. We present an online algorithm that uses summary statistics
computed from sentences in combination with repeated sampling
techniques to learn the parameters of stochastic context free
grammars. Empirical results demonstrate that the algorithm performs
just as well as the Inside-Outside algorithm despite the fact that
it has access to less information about the training data.}
}