# By: Riasat Ullah
# This file defines a class that can be used to transform and process a list of strings.
# It can also compute cosine similarity scores between the strings in the list.

import numpy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from num2words import num2words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from validations import string_validator


class TextProcessor(object):
    '''
    Holds a corpus (a list of strings) and offers text clean-up steps that
    rewrite the corpus in place, plus TF-IDF based cosine similarity between
    the strings of the corpus.
    '''

    # Characters stripped by remove_punctuations(). The apostrophe is not part
    # of this set; remove_apostrophe() takes care of it. The backslash is
    # written as an explicit '\\' so the literal has no invalid escape sequence.
    _PUNCTUATION = "!\"#$%&()*+-./:;,<=>?@[\\]^_`{|}~\n"

    def __init__(self, corpus):
        '''
        Constructor
        :param corpus: (list) of str[n-grams]
        '''
        self.corpus = corpus

    def _is_empty(self):
        '''
        Tell whether the corpus has no strings at all.
        :return: (bool) True if the corpus is empty
        '''
        return numpy.size(self.corpus) == 0

    def _map_tokens(self, transform):
        '''
        Apply a function to every space separated token of every string and
        store the re-joined strings as the new corpus (a list).
        :param transform: callable taking a token (str) and returning a str
        '''
        self.corpus = [
            ' '.join(transform(token) for token in item.split(' '))
            for item in self.corpus
        ]

    def de_capitalize(self):
        '''
        Convert all strings to lower case.
        The corpus becomes a numpy array of strings.
        '''
        # An empty list turns into a non-string numpy array, which the
        # numpy.char routines reject; there is nothing to convert anyway.
        if self._is_empty():
            return
        self.corpus = numpy.char.lower(self.corpus)

    def remove_punctuations(self):
        '''
        Remove punctuations (and new line characters) from strings.
        Each symbol is deleted, not replaced by a space.
        The corpus becomes a numpy array of strings.
        '''
        if self._is_empty():
            return
        cleaned = self.corpus
        for symbol in self._PUNCTUATION:
            cleaned = numpy.char.replace(cleaned, symbol, '')
        self.corpus = cleaned

    def remove_apostrophe(self):
        '''
        Remove apostrophe from strings.
        The corpus becomes a numpy array of strings.
        '''
        if self._is_empty():
            return
        self.corpus = numpy.char.replace(self.corpus, "'", '')

    def remove_stopwords(self, lang='english'):
        '''
        Remove all stop words.
        Strings are split on single spaces; the comparison is case sensitive.
        :param lang: language
        '''
        # A set gives constant time membership tests for every token.
        stop_set = set(stopwords.words(lang))
        self.corpus = [
            ' '.join(token for token in item.split(' ') if token not in stop_set)
            for item in self.corpus
        ]

    def convert_numbers_to_words(self):
        '''
        Convert numbers to words.
        Only tokens accepted by string_validator.is_number are converted.
        '''
        def to_words(token):
            return num2words(token) if string_validator.is_number(token) else token

        self._map_tokens(to_words)

    def lemmatize(self):
        '''
        Lemmatize all strings in the corpus, token by token.
        '''
        self._map_tokens(WordNetLemmatizer().lemmatize)

    def stem(self):
        '''
        Use a stemmer to shorten words in all the strings.
        '''
        self._map_tokens(PorterStemmer().stem)

    def default_pre_process(self):
        '''
        Pre-process all strings. This is the default pre-processor.
        Steps, in order: lower case, strip punctuations, strip apostrophes,
        numbers to words, drop stop words, lemmatize.
        '''
        self.de_capitalize()
        self.remove_punctuations()
        self.remove_apostrophe()
        self.convert_numbers_to_words()
        self.remove_stopwords()
        self.lemmatize()

    def get_similarity_matrix(self, search_words=None):
        '''
        Get the full matrix of the cosine values calculated using tf-idf vectors.
        :param search_words: (list) of words to set as the vocabulary for TF-IDF calculations.
        :return: 2-D array of float; entry [i][j] is the cosine similarity of strings i and j
        '''
        # If we do not ensure that the search words are unique we will get this error -
        # ValueError: Duplicate term in vocabulary: '{ term }'
        # dict.fromkeys drops duplicates while keeping the first-seen order, so
        # the vocabulary order does not depend on hashing.
        vocabulary = None if search_words is None else list(dict.fromkeys(search_words))

        vectorizer = TfidfVectorizer(vocabulary=vocabulary)
        tfidf = vectorizer.fit_transform(self.corpus)
        # The sparse matrix is passed as is; the result is still a dense array.
        return cosine_similarity(tfidf)

    def get_similarity_scores(self, search_words=None, base_index=0):
        '''
        Get the cosine similarity values with respect to a particular string.
        :param search_words: (vocabulary) list of words
        :param base_index: (int) index of the base string
        :return: 1-D array of float; cosine similarity of the base string to every string
        '''
        return self.get_similarity_matrix(search_words)[base_index]