using System;
using System.Collections.Generic;
using System.IO;
using Latino;

namespace SvmPosTagger
{
    /* .-----------------------------------------------------------------------
       |
       |  Class WordDictionary
       |
       '-----------------------------------------------------------------------
    */
    /// <summary>
    /// Maps each word to the set of tags it was seen with. It also keeps a set of
    /// "hidden" words and the union of their tags, which is the tag set returned
    /// for any word that is not in the dictionary.
    /// </summary>
    public class WordDictionary : ISerializable
    {
        private Dictionary<string, Set<string>> m_dictionary // word -> tags seen with that word
            = new Dictionary<string, Set<string>>();
        private Set<string> m_hidden_words // words registered through SetHiddenWords
            = new Set<string>();
        private Set<string> m_unknown_word_tags // union of the tags of all hidden words
            = new Set<string>();

        /// <summary>
        /// Creates an empty dictionary.
        /// </summary>
        internal WordDictionary()
        {
        }

        /// <summary>
        /// Restores a dictionary that was previously written with <see cref="Save"/>.
        /// </summary>
        /// <param name="reader">Serializer to read from; must not be null.</param>
        public WordDictionary(BinarySerializer reader)
        {
            Utils.ThrowException(reader == null ? new ArgumentNullException("reader") : null);
            Load(reader); // throws serialization-related exceptions
        }

        /// <summary>
        /// Registers the hidden words and recomputes the unknown-word tag set as the
        /// union of the tags of these words.
        /// </summary>
        /// <param name="hidden_words">Hidden words; each one must already be in the dictionary.</param>
        internal void SetHiddenWords(Set<string> hidden_words)
        {
            Utils.ThrowException(hidden_words == null ? new ArgumentNullException("hidden_words") : null);
            // Build the tag set from scratch so that a repeated call does not keep tags
            // that were contributed by an earlier set of hidden words.
            Set<string> unknown_word_tags = new Set<string>();
            foreach (string hidden_word in hidden_words)
            {
                unknown_word_tags.AddRange(m_dictionary[hidden_word]); // throws KeyNotFoundException if the word is not in the dictionary
            }
            // Assign only after the loop succeeded so both fields always change together.
            m_hidden_words = hidden_words;
            m_unknown_word_tags = unknown_word_tags;
        }

        /// <summary>
        /// Records that <paramref name="word"/> was seen with <paramref name="tag"/>.
        /// </summary>
        internal void AddWordTagPair(string word, string tag)
        {
            Set<string> tags;
            if (m_dictionary.TryGetValue(word, out tags))
            {
                tags.Add(tag);
            }
            else
            {
                m_dictionary.Add(word, new Set<string>(new string[] { tag }));
            }
        }

        /// <summary>
        /// Tags assigned to words that are not in the dictionary.
        /// </summary>
        public Set<string>.ReadOnly UnknownWordTags
        {
            get { return m_unknown_word_tags; }
        }

        /// <summary>
        /// Tells whether <paramref name="word"/> has an entry in the dictionary.
        /// </summary>
        public bool Contains(string word)
        {
            Utils.ThrowException(word == null ? new ArgumentNullException("word") : null);
            return m_dictionary.ContainsKey(word);
        }

        /// <summary>
        /// Returns the tags recorded for <paramref name="word"/>, or the unknown-word
        /// tags if the word is not in the dictionary.
        /// </summary>
        public Set<string>.ReadOnly GetTags(string word)
        {
            Utils.ThrowException(word == null ? new ArgumentNullException("word") : null);
            Set<string> tags;
            if (!m_dictionary.TryGetValue(word, out tags)) { tags = m_unknown_word_tags; }
            return tags;
        }

        /// <summary>
        /// Returns all words that have an entry in the dictionary.
        /// </summary>
        public string[] GetWords()
        {
            return new ArrayList<string>(m_dictionary.Keys).ToArray();
        }

        // Joins the tags with '_' after sorting them, so the result does not depend on
        // the enumeration order of the set. An empty set gives an empty string.
        private string GetAmbiguityClass(Set<string> tag_class)
        {
            if (tag_class.Count == 0) { return ""; }
            ArrayList<string> tags_sorted = new ArrayList<string>(tag_class);
            tags_sorted.Sort();
            return string.Join("_", tags_sorted.ToArray());
        }

        /// <summary>
        /// Returns the ambiguity class (sorted tags joined with '_') of
        /// <paramref name="word"/>. A word that is not in the dictionary gets the
        /// ambiguity class of the unknown-word tags.
        /// </summary>
        public string GetAmbiguityClass(string word)
        {
            Utils.ThrowException(word == null ? new ArgumentNullException("word") : null);
            Set<string> tag_class;
            if (!m_dictionary.TryGetValue(word, out tag_class)) { tag_class = m_unknown_word_tags; }
            return GetAmbiguityClass(tag_class);
        }

        /// <summary>
        /// Returns the ambiguity class built from the unknown-word tags.
        /// </summary>
        public string GetUnknownWordAmbiguityClass()
        {
            return GetAmbiguityClass(m_unknown_word_tags);
        }

        /// <summary>
        /// Tells whether <paramref name="word"/> is one of the registered hidden words.
        /// </summary>
        public bool IsHiddenWord(string word)
        {
            Utils.ThrowException(word == null ? new ArgumentNullException("word") : null);
            return m_hidden_words.Contains(word);
        }

        // *** ISerializable interface implementation ***

        /// <summary>
        /// Writes the dictionary as a list of (word, tags) pairs, followed by the hidden
        /// words and the unknown-word tags.
        /// </summary>
        /// <param name="writer">Serializer to write to; must not be null.</param>
        public void Save(BinarySerializer writer)
        {
            Utils.ThrowException(writer == null ? new ArgumentNullException("writer") : null);
            ArrayList<Pair2<string, Set<string>>> tmp = new ArrayList<Pair2<string, Set<string>>>();
            foreach (KeyValuePair<string, Set<string>> word_info in m_dictionary)
            {
                tmp.Add(new Pair2<string, Set<string>>(word_info.Key, word_info.Value));
            }
            tmp.Save(writer);
            m_hidden_words.Save(writer);
            m_unknown_word_tags.Save(writer);
        }

        // Reads the three parts in the same order in which Save writes them.
        private void Load(BinarySerializer reader)
        {
            ArrayList<Pair2<string, Set<string>>> tmp = new ArrayList<Pair2<string, Set<string>>>(reader);
            foreach (Pair2<string, Set<string>> word_info in tmp)
            {
                m_dictionary.Add(word_info.First, word_info.Second);
            }
            m_hidden_words = new Set<string>(reader);
            m_unknown_word_tags = new Set<string>(reader);
        }
    }

    /* .-----------------------------------------------------------------------
       |
       |  Class TaggedWord
       |
       '-----------------------------------------------------------------------
    */
    /// <summary>
    /// A word together with its tag. The word is fixed at construction; the tag
    /// may be null and can be changed later.
    /// </summary>
    public class TaggedWord
    {
        private string m_tag;
        private string m_word;

        /// <summary>
        /// Creates a word without a tag (the tag is null).
        /// </summary>
        /// <param name="word">The word as it appears in the text; must not be null.</param>
        public TaggedWord(string word) : this(word, /*tag=*/null) // throws ArgumentNullException
        {
        }

        /// <summary>
        /// Creates a word with the given tag.
        /// </summary>
        /// <param name="word">The word as it appears in the text; must not be null.</param>
        /// <param name="tag">The tag of the word; may be null.</param>
        public TaggedWord(string word, string tag)
        {
            Utils.ThrowException(word == null ? new ArgumentNullException("word") : null);
            this.m_tag = tag;
            this.m_word = word;
        }

        /// <summary>
        /// The tag of the word; null if none has been assigned.
        /// </summary>
        public string Tag
        {
            get { return this.m_tag; }
            set { this.m_tag = value; }
        }

        /// <summary>
        /// The word exactly as it was passed to the constructor.
        /// </summary>
        public string Word
        {
            get { return this.m_word; }
        }

        /// <summary>
        /// The word converted to lower case; the conversion is done on every access.
        /// </summary>
        public string WordL
        {
            get { return this.m_word.ToLower(); }
        }
    }

    /* .-----------------------------------------------------------------------
       |
       |  Class Corpus
       |
       '-----------------------------------------------------------------------
    */
    /// <summary>
    /// A sequence of tagged words that can be read from and written to a text file
    /// (one word per line), and from which feature vectors are generated for a
    /// word and its surrounding context.
    /// </summary>
    public class Corpus
    {
        private ArrayList<TaggedWord> m_tagged_words // array of word-tag pairs
            = new ArrayList<TaggedWord>();

        /// <summary>
        /// The words of the corpus in file order.
        /// </summary>
        public ArrayList<TaggedWord>.ReadOnly TaggedWords
        {
            get { return m_tagged_words; }
        }

        /// <summary>
        /// Replaces the content of the corpus with the words read from a text file.
        /// Each non-blank line is split on spaces and tabs: the first field is the word,
        /// the second field (if present) is its tag, and any further fields are ignored.
        /// </summary>
        /// <param name="file_name">Name of an existing file; must not be null.</param>
        /// <param name="create_dictionary">
        /// If true, a dictionary of lower-cased words and their tags is built, and the
        /// words that occur in only one of 10 consecutive folds are marked as hidden.
        /// </param>
        /// <returns>The dictionary, or null if <paramref name="create_dictionary"/> is false.</returns>
        /// <exception cref="InvalidOperationException">
        /// A dictionary was requested but the file holds fewer than 10 words.
        /// </exception>
        public WordDictionary LoadFromFile(string file_name, bool create_dictionary)
        {
            Utils.ThrowException(file_name == null ? new ArgumentNullException("file_name") : null);
            Utils.ThrowException(!Utils.VerifyFileName(file_name, /*must_exist=*/true) ? new InvalidArgumentValueException("file_name") : null);
            const int num_folds = 10; // *** 10 folds are used to determine hidden words
            string[] separators = new string[] { " ", "\t" };
            WordDictionary dictionary = create_dictionary ? new WordDictionary() : null;
            m_tagged_words.Clear();
            // The using block releases the file handle even if reading or parsing throws.
            using (StreamReader reader = new StreamReader(file_name))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    string[] fields = line.Split(separators, StringSplitOptions.RemoveEmptyEntries);
                    if (fields.Length == 0) { continue; } // blank line
                    string word = fields[0];
                    string tag = fields.Length > 1 ? fields[1] : null;
                    TaggedWord tagged_word = new TaggedWord(word, tag);
                    m_tagged_words.Add(tagged_word);
                    // NOTE(review): a line without a tag passes a null tag to the dictionary;
                    // whether Set<string> accepts null is not visible here - confirm.
                    if (create_dictionary) { dictionary.AddWordTagPair(tagged_word.WordL, tag); }
                }
            }
            if (create_dictionary)
            {
                Utils.ThrowException(m_tagged_words.Count < num_folds
                    ? new InvalidOperationException("The corpus must contain at least 10 words to create a dictionary.") : null);
                Set<string> hidden_words = GetHiddenWords(num_folds);
                dictionary.SetHiddenWords(hidden_words);
            }
            return dictionary;
        }

        /// <summary>
        /// Writes the corpus to a text file, one "word TAB tag" line per word.
        /// </summary>
        /// <param name="file_name">Name of the file to write; must not be null.</param>
        public void SaveIntoFile(string file_name)
        {
            Utils.ThrowException(file_name == null ? new ArgumentNullException("file_name") : null);
            Utils.ThrowException(!Utils.VerifyFileName(file_name, /*must_exist=*/false) ? new InvalidArgumentValueException("file_name") : null);
            // The using block flushes and closes the file even if a write throws.
            using (StreamWriter writer = new StreamWriter(file_name))
            {
                foreach (TaggedWord tagged_word in m_tagged_words)
                {
                    writer.WriteLine("{0}\t{1}", tagged_word.Word, tagged_word.Tag);
                }
            }
        }

        // Sets the component of the named feature to 1. An unseen feature gets the next
        // free index if the feature space may be extended; otherwise it is ignored.
        private static void AddFeature(string feature_name, Dictionary<string, int> feature_space, bool extend_feature_space, SparseVector2<double> feature_vector)
        {
            int feature_idx;
            if (feature_space.TryGetValue(feature_name, out feature_idx))
            {
                feature_vector[feature_idx] = 1.0;
            }
            else if (extend_feature_space)
            {
                feature_vector[feature_space.Count] = 1.0;
                feature_space.Add(feature_name, feature_space.Count);
            }
        }

        // Returns the last n characters of the word, or the whole word if it is not longer than n.
        private static string GetSuffix(string word, int n)
        {
            if (word.Length <= n) { return word; }
            return word.Substring(word.Length - n);
        }

        // Returns the first n characters of the word, or the whole word if it is not longer than n.
        private static string GetPrefix(string word, int n)
        {
            if (word.Length <= n) { return word; }
            return word.Substring(0, n);
        }

        // Returns the ambiguity class used in place of a tag: the unknown-word class for
        // hidden words, the word's own class otherwise.
        private static string GetAmbiguityClassOrUnknown(WordDictionary dictionary, string word)
        {
            return dictionary.IsHiddenWord(word) ? dictionary.GetUnknownWordAmbiguityClass() : dictionary.GetAmbiguityClass(word);
        }

        /// <summary>
        /// Builds a binary feature vector for the word at <paramref name="word_idx"/> from
        /// a window of up to three words on each side: lower-cased words, prefixes and
        /// suffixes, tags of the preceding words, and possible tags and ambiguity classes
        /// of the current and following words, both for single words and for bigrams and
        /// trigrams.
        /// </summary>
        /// <param name="word_idx">Index of the word in the corpus.</param>
        /// <param name="feature_space">Mapping from feature name to vector index; must not be null.</param>
        /// <param name="extend_feature_space">
        /// If true, unseen features are added to <paramref name="feature_space"/>;
        /// if false, they are left out of the vector.
        /// </param>
        /// <param name="dictionary">Dictionary that supplies tags and ambiguity classes; must not be null.</param>
        /// <returns>A sparse vector with 1.0 at the index of every feature that is present.</returns>
        public SparseVector2<double> GenerateFeatureVector(int word_idx, Dictionary<string, int> feature_space, bool extend_feature_space, WordDictionary dictionary)
        {
            Utils.ThrowException((word_idx < 0 || word_idx >= m_tagged_words.Count) ? new ArgumentOutOfRangeException("word_idx") : null);
            Utils.ThrowException(dictionary == null ? new ArgumentNullException("dictionary") : null);
            Utils.ThrowException(feature_space == null ? new ArgumentNullException("feature_space") : null);
            SparseVector2<double> feature_vector = new SparseVector2<double>();
            // The order of the AddFeature calls determines the indices given to new
            // features, so it must not be changed.
            // *** unigrams ***
            for (int offset = -3; offset <= 3; offset++) // consider context of 3 + 1 + 3 words
            {
                int idx = word_idx + offset;
                if (idx < 0 || idx >= m_tagged_words.Count) { continue; } // outside of the corpus
                TaggedWord tagged_word = m_tagged_words[idx];
                string word = tagged_word.WordL; // lower-cased once and reused below
                AddFeature(string.Format("w({0}) {1}", offset, word), feature_space, extend_feature_space, feature_vector);
                for (int i = 1; i <= 4; i++) // consider prefixes and suffixes of up to 4 letters
                {
                    AddFeature(string.Format("p{0}({1}) {2}", i, offset, GetPrefix(word, i)), feature_space, extend_feature_space, feature_vector);
                    AddFeature(string.Format("s{0}({1}) {2}", i, offset, GetSuffix(word, i)), feature_space, extend_feature_space, feature_vector);
                }
                if (offset < 0) // tag is available iff offset < 0
                {
                    AddFeature(string.Format("t({0}) {1}", offset, tagged_word.Tag), feature_space, extend_feature_space, feature_vector);
                }
                else // tag not available; use "maybe" features and ambiguity class instead
                {
                    Set<string>.ReadOnly tags = dictionary.IsHiddenWord(word) ? dictionary.UnknownWordTags : dictionary.GetTags(word);
                    foreach (string tag in tags)
                    {
                        AddFeature(string.Format("m({0}) {1}", offset, tag), feature_space, extend_feature_space, feature_vector);
                    }
                    string ambiguity_class = GetAmbiguityClassOrUnknown(dictionary, word);
                    AddFeature(string.Format("t({0}) {1}", offset, ambiguity_class), feature_space, extend_feature_space, feature_vector);
                }
            }
            // *** bigrams and trigrams ***
            const int num_affixes = 4;
            for (int n = 2; n <= 3; n++)
            {
                for (int offset = -2; offset <= 3 - n; offset++) // consider 4 bigrams and 3 trigrams
                {
                    int first_idx = word_idx + offset;
                    if (first_idx < 0 || first_idx + (n - 1) >= m_tagged_words.Count) { continue; } // n-gram does not fit into the corpus
                    string word_feature = string.Format("w({0},{1})", n, offset);
                    string tag_feature = string.Format("t({0},{1})", n, offset);
                    string[] prefix_feature = new string[num_affixes];
                    string[] suffix_feature = new string[num_affixes];
                    // NOTE(review): the affix lengths here run from 0 to 3, whereas the unigram
                    // features use 1 to 4; length 0 gives an empty prefix/suffix. This is kept
                    // as is because the feature names are part of every feature space built so
                    // far - confirm whether 1 to 4 was intended before changing it.
                    for (int i = 0; i < num_affixes; i++)
                    {
                        prefix_feature[i] = string.Format("p{0}({1},{2})", i, n, offset);
                        suffix_feature[i] = string.Format("s{0}({1},{2})", i, n, offset);
                    }
                    for (int i = 0; i < n; i++)
                    {
                        TaggedWord tagged_word = m_tagged_words[first_idx + i];
                        string word = tagged_word.WordL;
                        word_feature += " " + word;
                        for (int j = 0; j < num_affixes; j++) // prefixes and suffixes
                        {
                            prefix_feature[j] += " " + GetPrefix(word, j);
                            suffix_feature[j] += " " + GetSuffix(word, j);
                        }
                        if (offset + i < 0) // tag is available iff offset + i < 0
                        {
                            tag_feature += " " + tagged_word.Tag;
                        }
                        else // tag not available; use ambiguity class instead
                        {
                            tag_feature += " " + GetAmbiguityClassOrUnknown(dictionary, word);
                        }
                    }
                    AddFeature(word_feature, feature_space, extend_feature_space, feature_vector);
                    AddFeature(tag_feature, feature_space, extend_feature_space, feature_vector);
                    for (int i = 0; i < num_affixes; i++) // add prefix and suffix features
                    {
                        AddFeature(prefix_feature[i], feature_space, extend_feature_space, feature_vector);
                        AddFeature(suffix_feature[i], feature_space, extend_feature_space, feature_vector);
                    }
                }
            }
            return feature_vector;
        }

        // Tells whether the word occurs in any fold other than the one at fold_idx.
        private static bool OccursInOtherFold(Set<string>[] folds, int fold_idx, string word)
        {
            for (int j = 0; j < folds.Length; j++)
            {
                if (j != fold_idx && folds[j].Contains(word)) { return true; }
            }
            return false;
        }

        // Splits the corpus into num_folds consecutive parts (the last part also takes the
        // remainder) and returns the lower-cased words that occur in exactly one part.
        private Set<string> GetHiddenWords(int num_folds)
        {
            Set<string>[] folds = new Set<string>[num_folds];
            for (int i = 0; i < folds.Length; i++) { folds[i] = new Set<string>(); }
            // At least one word per fold, so the division below is always defined.
            int words_per_fold = Math.Max(1, m_tagged_words.Count / num_folds);
            for (int i = 0; i < m_tagged_words.Count; i++)
            {
                int fold_idx = Math.Min(i / words_per_fold, folds.Length - 1);
                folds[fold_idx].Add(m_tagged_words[i].WordL);
            }
            Set<string> hidden_words = new Set<string>();
            for (int i = 0; i < folds.Length; i++)
            {
                foreach (string word in folds[i])
                {
                    if (!OccursInOtherFold(folds, i, word))
                    {
                        hidden_words.Add(word);
                    }
                }
            }
            return hidden_words;
        }
    }
}
