"""Classify texts as 'A2' or 'B1' from their part-of-speech tag n-grams.

The script downloads one text labelled 'A2' and one labelled 'B1', turns each
into a sequence of tags via estnltk's ``Text.postags``, builds overlapping tag
n-grams of length ``pikkus``, and trains an NLTK Naive Bayes classifier on
them. It then prints the model's accuracy, its most informative features, and
how the n-grams of a second 'A2' text are classified.
"""

import urllib.request
from collections import Counter

# Number of consecutive tags joined into one feature value (2 = tag pairs).
pikkus = 2

# All corpus files live in the same directory and differ only in their stem.
CORPUS_URL = "http://www.tlu.ee/~kais/Digihum_tehnoloogiad/Nadal6/{}.txt"


def tunnused(paar):
    """Return the feature dict the classifier sees for one tag n-gram string."""
    return {'sõnaliigipaar': paar}


def tag_ngrams(tags, n):
    """Return every run of *n* consecutive items of *tags*, joined with '-'.

    Args:
        tags: Sequence of tag strings.
        n: Window length; must be at least 1.

    Returns:
        A list of ``len(tags) - n + 1`` strings, or an empty list when *tags*
        is shorter than *n*.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n-gram length must be >= 1, got {n}")
    # Zipping the sequence with itself shifted by 0..n-1 yields the sliding
    # windows; zip stops at the shortest shift, so no partial windows appear.
    shifted = (tags[offset:] for offset in range(n))
    return ["-".join(window) for window in zip(*shifted)]


def label_ngrams(ngrams, label):
    """Pair the feature dict of each n-gram with *label* for training."""
    return [(tunnused(ngram), label) for ngram in ngrams]


def fetch_postags(stem):
    """Download the corpus file named *stem* and return its ``postags``.

    The text is decoded as UTF-8 and lower-cased before it is handed to
    estnltk.
    """
    # Imported here so the pure helpers above stay importable without estnltk.
    from estnltk import Text

    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(CORPUS_URL.format(stem)) as response:
        raw = response.read()
    return Text(raw.decode("utf8").lower()).postags


def main():
    """Train on the A2/B1 texts, report on the model, classify a new text."""
    import nltk

    a2_tags = fetch_postags("A2_2018_I")
    b1_tags = fetch_postags("B1_2018_I")

    # Give both classes the same amount of training material, whichever text
    # happens to be longer, so neither label dominates the classifier's prior.
    shared_length = min(len(a2_tags), len(b1_tags))
    a2_tags = a2_tags[:shared_length]
    b1_tags = b1_tags[:shared_length]

    andmed = label_ngrams(tag_ngrams(a2_tags, pikkus), 'A2')
    andmed += label_ngrams(tag_ngrams(b1_tags, pikkus), 'B1')

    mudel = nltk.NaiveBayesClassifier.train(andmed)
    print("Model Accuracy")
    # NOTE: measured on the training data itself, so this is an optimistic
    # figure and not an estimate of performance on unseen text.
    print(nltk.classify.accuracy(mudel, andmed))
    mudel.show_most_informative_features(20)

    new_ngrams = tag_ngrams(fetch_postags("A2_2018_II"), pikkus)
    print("Classification of New Text")
    # Each n-gram is classified on its own; the Counter shows the label totals.
    print(Counter(mudel.classify(tunnused(ngram)) for ngram in new_ngrams))


if __name__ == "__main__":
    main()