import leven from "leven";
import { sanitize } from "./common";

// Matches a run of characters that are neither letters nor digits (Unicode aware).
// The capturing parentheses make `String.prototype.split` keep the delimiters in its result.
const TOKEN_DELIMITER_PATTERN = /([^\p{Alphabetic}\p{Number}]+)/u;

// Matches a non-empty string consisting only of letters and digits (Unicode aware).
const ALPHANUMERIC_PATTERN = /^[\p{Alphabetic}\p{Number}]+$/u;

// Two words are treated as "the same word" when their Levenshtein distance is below this
// bound (and below the length of the word being matched, so that very short words are not
// matched by anything).
const MAX_WORD_DISTANCE = 3;

// Two intents are reported as similar when their similarity score exceeds this value.
const SIMILAR_INTENT_THRESHOLD = 0.7;

// A similar intent is counted as this much less than a full intent in the quality score.
const SIMILAR_INTENT_PENALTY = 0.5;

export const nlpUtils = {

    /**
     * Splits the given string into tokens. Runs of letters/digits and runs of any other
     * characters (the delimiters) are both kept as tokens; empty tokens are dropped.
     * @param {string} str The text to tokenize.
     * @returns {string[]} The non-empty tokens in their original order.
     */
    tokenize(str) {
        return str
            .split(TOKEN_DELIMITER_PATTERN)
            .filter((token) => token.length > 0);
    },

    /**
     * Returns the given string separated into tokens, excluding empty tokens and tokens
     * containing only whitespace. The remaining tokens are trimmed.
     * @param {string} str The text to tokenize.
     * @returns {string[]} The trimmed, non-blank tokens.
     */
    tokenizeWithoutWhitespaces(str) {
        return nlpUtils
            .tokenize(str)
            .map((token) => token.trim())
            .filter((token) => token.length > 0);
    },

    /**
     * Keeps only those tokens that consist entirely of letters and digits.
     * @param {string[]} tokens The tokens to filter.
     * @returns {string[]} A new array with the alphanumeric tokens.
     */
    onlyAlphanumericTokens(tokens) {
        return tokens.filter((token) => nlpUtils.isAlphanumeric(token));
    },

    /**
     * Tells whether the whole string is made of letters and digits (Unicode aware).
     * @param {string} str The string to check.
     * @returns {boolean} `true` for a non-empty alphanumeric string, `false` otherwise.
     */
    isAlphanumeric(str) {
        return ALPHANUMERIC_PATTERN.test(str);
    },

    /**
     * Calculates a similarity score between two intents.
     * Uses the Levenshtein distance, so no word vectors are needed and the calculation is fast.
     * However, the similarity is based only on matching characters in words. Synonyms are not
     * taken into account.
     * For each word of the shorter input, the most similar word of the longer input is matched.
     * The same word of the longer input cannot be matched twice.
     * Both texts are passed through `sanitize` before tokenizing.
     * @param {*} a An intent object with `text` attribute.
     * @param {*} b An intent object with `text` attribute.
     * @returns {number} A similarity score between 0 (a and b are totally different) and
     *      1 (a and b are equal). Returns 0 when neither text contains an alphanumeric word.
     */
    intentSimilarity(a, b) {
        const aWords = nlpUtils.onlyAlphanumericTokens(nlpUtils.tokenize(sanitize(a.text)));
        const bWords = nlpUtils.onlyAlphanumericTokens(nlpUtils.tokenize(sanitize(b.text)));

        // Always iterate over the input with fewer words and match against the longer one.
        const [shorter, longer] = aWords.length > bWords.length
            ? [bWords, aWords]
            : [aWords, bWords];

        // Nothing to compare; avoid the 0 / 0 division that would produce NaN.
        if (longer.length === 0) {
            return 0;
        }

        // Words of the longer input that have not been matched yet.
        const unmatched = [...longer];
        let commonCount = 0;

        for (const word of shorter) {
            // Find the closest unmatched word; on ties the first one wins.
            let bestIndex = -1;
            let bestDistance = Infinity;
            unmatched.forEach((candidate, index) => {
                const distance = leven(word, candidate);
                if (distance < bestDistance) {
                    bestDistance = distance;
                    bestIndex = index;
                }
            });

            if (bestDistance < Math.min(word.length, MAX_WORD_DISTANCE)) {
                unmatched.splice(bestIndex, 1); // a word can be matched only once
                commonCount += 1;
            }
        }

        // The fraction of words of the longer input that have a counterpart in the shorter one.
        return commonCount / longer.length;
    },

    /**
     * Returns the index of the first intent that is very similar to the given intent.
     * @param {Array} intents Where to search for duplicates.
     * @param {*} baseIntent The intent compared with the items of `intents`.
     * @returns {number} An index into `intents`, or -1 when no similar intent exists.
     */
    findSimilarIntentIndex(intents, baseIntent) {
        return intents.findIndex((intent) =>
            intent !== baseIntent && // prevent comparing the same intent object with itself
            nlpUtils.intentSimilarity(baseIntent, intent) > SIMILAR_INTENT_THRESHOLD
        );
    },

    /**
     * Tells whether `intents` contains an intent very similar to the given one.
     * @param {Array} intents Where to search for duplicates.
     * @param {*} baseIntent The intent compared with the items of `intents`.
     * @returns {boolean} `true` when a similar intent was found.
     */
    hasSimilarIntent(intents, baseIntent) {
        return nlpUtils.findSimilarIntentIndex(intents, baseIntent) !== -1;
    },

    /**
     * Returns how many of the given intents are similar to others.
     * Each intent is compared only to the previous ones, so a pair is counted once.
     * @param {Array} intents The intents to examine.
     * @returns {number} The number of intents similar to some earlier intent.
     */
    numberOfSimilarIntents(intents) {
        return intents
            .filter((intent, index) => nlpUtils.hasSimilarIntent(intents.slice(0, index), intent))
            .length;
    },

    /**
     * Calculates the score representing the quality of the given intents (in percent).
     * If the number of intents is lower than `intentLimit`, the score is always 0.
     * Otherwise, the score is the number of intents as a percentage of `optimalIntentCount`,
     * capped at 100.
     * Every intent which is similar to an earlier one counts only as half an intent, i.e. it
     * decreases the score by 5 percentage points with the default `optimalIntentCount`.
     * @param {Array} intents The entered intents.
     * @param {number} [intentLimit=3] The minimal number of intents for a non-zero score.
     * @param {number} [optimalIntentCount=10] The number of distinct intents worth 100 %.
     * @returns {number} The score, at most 100.
     */
    intentQualityScore(intents, intentLimit = 3, optimalIntentCount = 10) {
        if (intents.length < intentLimit) {
            return 0;
        }

        const similarIntents = nlpUtils.numberOfSimilarIntents(intents);
        const effectiveCount = intents.length - (similarIntents * SIMILAR_INTENT_PENALTY);
        const percent = 100 * effectiveCount / optimalIntentCount;
        return Math.min(percent, 100);
    },

    /**
     * Maps the quality score of the given intents to a color name.
     * @param {Array} intents The entered intents.
     * @param {number} [intentLimit] Passed to `intentQualityScore`; its default applies when omitted.
     * @returns {string} "red" for a score below 30, "orange" below 60, "green" otherwise.
     */
    intentQualityScoreColor(intents, intentLimit) {
        const score = nlpUtils.intentQualityScore(intents, intentLimit);

        if (score < 30) {
            return "red";
        }
        if (score < 60) {
            return "orange";
        }
        return "green";
    },

};