Dictionary.js

/*Copyright (C) 2019-2022 Crawford Currie http://c-dot.co.uk*/

import { LetterNode } from "./LetterNode.js";

/**
 * Dictionary using a Directed Acyclic Word Graph (DAWG) in the
 * format generated by compress.js
 *
 * Note that the DAWG uses letter indices, and not actual characters, to
 * represent code points. To use this dictionary you also need an
 * alphabet of code points sorted in the same order as that used to
 * generate the DAWG.
 * @class Dictionary
 */
class Dictionary {

  /**
   * Cache of dictionaries
   * @member {Dictionary[]}
   * @private
   */
  static cache = [];

  /**
   * @param {string} name of the dictionary
   * @private
   */
  constructor(name) {
    /**
     * First node in the dictionary.
     * @member {LetterNode?}
     */
    this.root = undefined;

    /**
     * List of valid start points, such that at least one
     * start point must match() for any sequence of chars, or
     * there can't possibly be a word. Map from letter to a
     * LetterNode or a list of LetterNode.
     * @private
     */
    this.sequenceRoots = undefined;

    /**
     * List of valid start points, such that at least one
     * start point must match() for any sequence of chars,
     * or there can't possibly be a word.
     * @member {string}
     */
    this.name = name;
  }

  /**
   * Load a DAG, as generated by dictionary_compressor.js. This is
   * destructive; anything already in the dictionary will be
   * discarded.
   * @param {(Buffer|Array)?} data the DAWG data.
   * @return {Dictionary} this
   */
  loadDAWG(data) {
    const dv = new DataView(data);
    let index = 0;
    const numberOfNodes = dv.getUint32(4 * index++);
    const nodes = [];
    for (let i = 0; i < numberOfNodes; i++) {
      const letter = dv.getUint32(4 * index++);
      const node = new LetterNode(String.fromCodePoint(letter));
      node.decode(i, dv.getUint32(4 * index++));
      //console.log(`${nodes.length} `,node);
      nodes.push(node);
    }
    // Convert node indices to pointers
    for (let i = 0; i < nodes.length; i++) {
      const node = nodes[i];
      if (typeof node.next === "number")
        node.next = nodes[node.next];
      if (typeof node.child === "number")
        node.child = nodes[node.child];
    }
    this.root = nodes[0];

    return this;
  }

  /*
   * Cross-link nodes in the dictionary with nodes before and
   * after them, for fast traversal.
   * @return {Dictionary} this
   */
  addLinks() {
    // Build forward and back lists
    this.root.buildLists();
    return this;
  }

  /**
   * @callback Dictionary~wordCallback
   * @param {string} word Word found
   * @param {LetterNode} node Node where word was terminated
   */

  /**
   * Apply the callback to each of the words represented in the DAWG
   * (potentially huge!)
   * @param {Dictionary~wordCallback} callback function
   */
  eachWord(callback) {
    return this.root.eachWord("", callback);
  }

  /**
   * Return the LetterNode that matches the last character
   * in chars, starting from the root / first character.
   * @param {string} chars characters that may be the root of a word
   * @return {LetterNode} node found, or undefined
   */
  match(chars) {
    return this.root.match(chars, 0);
  }

  /**
   * Check if a word is in the dictionary
   * @param {string} chars a word to check
   * @return {boolean} true if the word is found, false otherwise
   */
  hasWord(chars) {
    const m = this.root.match(chars, 0);
    return m && m.isEndOfWord;
  }

  /**
   * Find anagrams of a set of letters. An anagram is defined as any
   * complete (2 or more characters) word that uses all or some of the
   * letters passed in.
   * @param {string} theChars the letters, ' ' for an any-letter wildcard.
   * @return {Object<string, string>} a map of actual words to the letter
   * sequence (using ' ' for blanks) that matched.
   */
  findAnagrams(theChars) {
    theChars = theChars.toUpperCase();

    if (theChars.length < 2)
      throw Error(`Dictionary: '${theChars}' is Too short to find anagrams`);

    // Sort the list of characters.
    // Sorting makes it easier to debug.
    const ac = theChars.split("");//.sort();

    //console.log('Sorted chars', ac);
    const foundWords = {};
    this.root.findWordsThatUse(ac, "", "", foundWords);
    return foundWords;
  }

  /** 
   * Find hangman matches for a set of letters. A hangman match is any
   * word(s) that match against an ordered set of letters, using a space
   * for an any-letter wildcard. For example, "EXAMPLE" is a hangman match
   * for "E AM LE".
   * @param {string} theChars the letters, ' ' for an any-letter wildcard.
   * @return {string[]} the list of words that matched.
   */
  findHangmen(theChars) {
    theChars = theChars.toUpperCase();
    const list = [];
    this.root.hangmen(theChars, 0, "", list);
    return list;
  }

  /**
   * For each letter of the alphabet, establish a list of valid
   * start points, such that at least one start point must match()
   * for any sequence of chars, or there can't possibly be a word.
   * @private
   */
  createSequenceRoots() {
    this.sequenceRoots = {};
    this.root.eachNode(node => {
      if (!this.sequenceRoots[node.letter])
        this.sequenceRoots[node.letter] = [node];
      else
        this.sequenceRoots[node.letter].push(node);
      return true;
    });
    //console.log(`Created sequence roots for dictionary "${this.name}"`);
  }

  /**
   * Get a list of the sequence roots for ch. The sequence roots
   * are all those nodes that represent the character in any word.
   * From a sequence root we can follow post or pre to extend the
   * word in either direction.
   * @param {string} ch character to find roots for
   * @return {LetterNode[]} list of the roots
   */
  getSequenceRoots(ch) {
    if (!this.sequenceRoots)
      this.createSequenceRoots();
    return this.sequenceRoots[ch] || [];
  }

  /**
   * Do the work of adding a word, but don't do anything about
   * pre-/post- links or sequence roots.
   * @private
   */
  _addWord(word) {
    /* istanbul ignore if */
    if (word.length === 0)
      return false;
    if (!this.root)
      this.root = new LetterNode(word.charAt(0));
    else if (this.hasWord(word))
      return false;
    this.root.add(word);
    return true;
  }

  /**
   * Add a word to the dictionary. No attempt is made at compression.
   * Note that previously retrieved sequence roots will no longer
   * be valid after the word is added and will need to be recomputed.
   * Note that we support single character words here, but
   * word games are limited to 2 letter or more. It's up to
   * the caller to enforce such constraints.
   * @return {boolean} true if the word needed to be added, false
   * if it was empty or already there.
   */
  addWord(word) {
    if (this._addWord(word)) {
      // Don't recreate, that will be done on demand
      delete this.sequenceRoots;
      // Re-build forward and back lists. This could be done
      // incrementally, but it's a reasonably cheap operation so....
      this.root.buildLists();
      return true;
    }
    return false;
  }

  /**
   * Find start node for the character sequence in the sequence
   * index i.e. it forms a valid sub-part of a word in the
   * dictionary. This way we can quickly eliminate sequences
   * such as "QX" which are never found in the dictionary. Note
   * that we don't have any way to reproduce the words that the
   * sequence is a valid part of; that's not the point, this is
   * intended to help eliminate invalid sequences when extending
   * a word backwards from a seed letter.
   * @param {string} seq letter sequence
   * @private
   */
  findSequence(seq) {
    if (!this.sequenceRoots)
      this.createSequenceRoots();
    const roots = this.sequenceRoots[seq.charAt(0)];
    if (!roots || roots.length <= 0)
      throw Error(`Dictionary: '${seq}' has no roots`);
    for (let root of roots) {
      if (root.match(seq, 0))
        return root;
    }
    // Not found
    return null;
  }

  /**
   * Return true if a start node for the character sequence is found
   * in the sequence index i.e. it forms a valid sub-part of a word
   * in the dictionary. This way we can quickly eliminate sequences
   * such as "QX" which are never found in the dictionary. Note that
   * we don't have any way to reproduce the words that the sequence
   * is a valid part of; that's not the point, this is intended to help
   * eliminate invalid sequences when extending a word backwards from
   * a seed letter.
   * @param {string} seq letter sequence
   * @return {boolean} if a start node exists
   */
  hasSequence(seq) {
    return this.findSequence(seq) != null;
  }
}

export { Dictionary }