Updated Tagger to tokenize sentences better and to fix a bug
Browse files
Tagger.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import numpy as np
|
| 3 |
import re
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
from sentence_transformers.util import cos_sim
|
| 6 |
import cv2
|
|
|
|
| 7 |
|
| 8 |
model = SentenceTransformer('all-mpnet-base-v2')
|
| 9 |
|
|
@@ -44,19 +46,20 @@ def get_majority_author_sentence(paras_text):
|
|
| 44 |
all_sents = []
|
| 45 |
paras_inds = []
|
| 46 |
for (i, pt) in enumerate(paras_text):
|
| 47 |
-
sents = re.split('\.|:',
|
|
|
|
| 48 |
for s in sents:
|
| 49 |
all_sents.append(s.lower())
|
| 50 |
paras_inds.append(i)
|
| 51 |
sem_ind, sem_score = semantic_match(template="justice x delivered the opinion of the court", corpus=all_sents)
|
| 52 |
sem_pc_ind, sem_pc_score = semantic_match(template="per curiam", corpus=all_sents)
|
| 53 |
kw_ind, kw_score = keyword_match(['delivered', 'opinion', 'court', 'justice'], corpus=all_sents)
|
| 54 |
-
kw_pc_ind, kw_pc_score = keyword_match(['per curiam'], corpus=all_sents)
|
| 55 |
|
| 56 |
if sem_ind == kw_ind: # Definitely true
|
| 57 |
return all_sents[sem_ind], paras_inds[sem_ind]
|
| 58 |
if sem_pc_ind == kw_pc_ind and (sem_pc_score > sem_score or kw_pc_score > kw_score):
|
| 59 |
-
return all_sents[sem_pc_ind],
|
| 60 |
raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
|
| 61 |
|
| 62 |
def draw_line_above_sent(folderpath, sent, para_ind):
|
|
@@ -81,4 +84,4 @@ def process_file(folderpath, draw=False):
|
|
| 81 |
paras_text = get_paragraphed_text(folderpath)
|
| 82 |
majority_author_sent, majority_author_para_ind = get_majority_author_sentence(paras_text)
|
| 83 |
if draw:
|
| 84 |
-
draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)
|
|
|
|
| 1 |
+
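# This file tags the major text: it locates the sentence (and its paragraph) that names the authoring justice or marks the opinion as per curiam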
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
| 4 |
import re
|
| 5 |
from sentence_transformers import SentenceTransformer
|
| 6 |
from sentence_transformers.util import cos_sim
|
| 7 |
import cv2
|
| 8 |
+
from nltk.tokenize import sent_tokenize, word_tokenize
|
| 9 |
|
| 10 |
model = SentenceTransformer('all-mpnet-base-v2')
|
| 11 |
|
|
|
|
| 46 |
all_sents = []
|
| 47 |
paras_inds = []
|
| 48 |
for (i, pt) in enumerate(paras_text):
|
| 49 |
+
# sents = re.split('\.|:',pt)
|
| 50 |
+
sents = sent_tokenize(pt)
|
| 51 |
for s in sents:
|
| 52 |
all_sents.append(s.lower())
|
| 53 |
paras_inds.append(i)
|
| 54 |
sem_ind, sem_score = semantic_match(template="justice x delivered the opinion of the court", corpus=all_sents)
|
| 55 |
sem_pc_ind, sem_pc_score = semantic_match(template="per curiam", corpus=all_sents)
|
| 56 |
kw_ind, kw_score = keyword_match(['delivered', 'opinion', 'court', 'justice'], corpus=all_sents)
|
| 57 |
+
kw_pc_ind, kw_pc_score = keyword_match(['per', 'curiam'], corpus=all_sents)
|
| 58 |
|
| 59 |
if sem_ind == kw_ind: # Definitely true
|
| 60 |
return all_sents[sem_ind], paras_inds[sem_ind]
|
| 61 |
if sem_pc_ind == kw_pc_ind and (sem_pc_score > sem_score or kw_pc_score > kw_score):
|
| 62 |
+
return all_sents[sem_pc_ind], paras_inds[sem_pc_ind]
|
| 63 |
raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
|
| 64 |
|
| 65 |
def draw_line_above_sent(folderpath, sent, para_ind):
|
|
|
|
| 84 |
paras_text = get_paragraphed_text(folderpath)
|
| 85 |
majority_author_sent, majority_author_para_ind = get_majority_author_sentence(paras_text)
|
| 86 |
if draw:
|
| 87 |
+
draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)
|