#! python3

#This program looks at text in a text document and compares each word with a
#dictionary list in an Excel spreadsheet.  It adds all new words to the
#spreadsheet along with example sentences.

#Limitations:  On occasion examples don't match well.
#              Text document doesn't handle complex formatting very well
#              Program assumes that orthographic words are found in Excecl in column B and examples in Column E.  You can change this if you like
#              Currently set to work with orthographic text since Tok Pisin has an orthography already, you can change this

#What to change to make it work on another computer:    (Mac users must change line 1 to #! /usr/bin/env python3
# Folder Dictionary is located in on line 21
# Location of Excel dictionary on line 22
# Location of text file to read texts from on line 36


#Task 1 Open Excel spreadsheet and store values
import openpyxl, os, codecs

os.chdir('C:\\Users\\Steve\\Documents\\Computer stuff\\Tok Pisin Dictionary')
wb = openpyxl.load_workbook('C:\\Users\\Steve\\Documents\\Computer stuff\\Tok Pisin Dictionary\\Dictionary.xlsx')
sheet = wb.get_sheet_by_name('Sheet1')
knownWords = []

for i in range(sheet.get_highest_row()):
    knownWords.append(sheet.cell(row=(i+2),column=2).value)
    #if knownWords[i] == None:            #These lines of code were used to delete None values, however this stopped the program working if you deleted stuff from the Excel dictionary beforehand
        #del knownWords[i]                #No loss in functionality removing this code, None values enable deleting in Excel to occur 
        #i -= 1
    #knownWords[i]=knownWords[i].lower()
    

#Task 2 Open Text document and store values

NewWrdFile = codecs.open('C:\\Users\\Steve\\Documents\\Computer stuff\\Tok Pisin Dictionary\\AddToDictionary.txt', encoding='utf-8')
NewText = NewWrdFile.read()
NewText = NewText.lower()


Sentences = NewText.split('.')
try:
    Sentences.remove(' ')
except ValueError:
    None

try:
    Sentences.remove('\n')
except ValueError:
    None



print('')

for i in range(len(Sentences)):
    Sentences[i] = Sentences[i].lstrip()
    Sentences[i] = Sentences[i] + '.'
        
def SplitAt(Character):
    global Sentences
    
    Qdel=[]                         #Gives the indices of question clauses to delete
    for i in range(len(Sentences)):
        if Character in Sentences[i]:
            Qdel.append(Sentences[i])
        
    Q=['']*len(Qdel)                #Saves Question clauses
    for i in range(len(Qdel)):      
        Q[i]=Qdel[i]                #Q is a 2 item list, contains 4 clauses

    for i in range(len(Qdel)):      #Deletes Question clauses from Sentences
        Sentences.remove(Qdel[i])
    
    q=['']*len(Q)                   #Splits Q into a 2 item embedded list
    for i in range(len(Q)):
        q[i] = Q[i].split(Character)      #q is a 2 item list of 2 item lists

    Qsentences = []    
    for i in range(len(q)):         #adds ? back to string
        q[i][0] += Character
        for j in range(2):
            q[i][j]=q[i][j].lstrip()
            Qsentences.append(q[i][j])

    Sentences =  Sentences + Qsentences

SplitAt('?')
SplitAt('!')
    
Sentences.sort(key=len)   
#Task 3 Identify new words in Word document

Words = NewText.split(' ')

for i in range(len(Words)):
    Words[i] = Words[i].strip(',')
    Words[i] = Words[i].strip('.')
    Words[i] = Words[i].strip('!')
    Words[i] = Words[i].strip('?')
    Words[i] = Words[i].strip('\n')
    Words[i] = Words[i].strip('\'')
    Words[i] = Words[i].strip('.')
    Words[i] = Words[i].strip('"')
    Words[i] = Words[i].strip(',')


for i in range(len(Words)):    
    try:
        if Words[i] == '':
            del Words[i]
    except IndexError:
        None
        
NewWords = []
for i in range(len(Words)):
    if Words[i] not in knownWords:
        NewWords.append(Words[i])
        
NewWords.sort()

for i in range(len(NewWords)):
    NewWords[i] = NewWords[i].strip('?')

try:
    for i in range(len(NewWords)):
        while NewWords[i] == NewWords[i+1]:
            del NewWords[i+1]
except IndexError:
    None
    
                            
#Task 4 Add new words with examples to Spreadsheet

        
FirstEmptyRow=2
while True:         
    if (sheet.cell(row=(FirstEmptyRow),column=2).value) != None:    #Find out where to start inputting data
        FirstEmptyRow += 1  
    else:
        break

 
for i in range(len(NewWords)):      #Generate correct cell reference
    Wrdcell = 'B' + str(i+FirstEmptyRow)
    sheet[Wrdcell]= NewWords[i]
    ExampleSentence = []
    Sencell = 'E' + str(i+FirstEmptyRow)

    for j in range(len(Sentences)):
        if NewWords[i] in Sentences[j]:
            ExampleSentence.append(Sentences[j])

    for j in range(len(ExampleSentence)):            
        if ' ' + NewWords[i] + ' ' in ExampleSentence[j]:
            EX = j
            break
        elif NewWords[i] + ' ' in ExampleSentence[j]:
            EX = j
        elif ' ' + NewWords[i] in ExampleSentence[j]:
            EX = j
        else:
            NewWords[i] in ExampleSentence[j]
            EX = j
         
    try:                           
        sheet[Sencell]= ExampleSentence[EX]

    except IndexError:
        None
    
        


wb.save('UpdatedDictionary.xlsx')