Update utils.py
Browse files
utils.py
CHANGED
|
@@ -3,14 +3,68 @@ import pandas as pd
|
|
| 3 |
from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper
|
| 4 |
from PyPDF2 import PdfReader
|
| 5 |
|
| 6 |
-
def pdf_to_text(pdf_path
|
| 7 |
-
|
| 8 |
text = ""
|
| 9 |
with open(pdf_path, "rb") as file:
|
| 10 |
-
reader =
|
| 11 |
-
for
|
|
|
|
| 12 |
text += page.extract_text() + "\n"
|
| 13 |
return text
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def save_to_excel(df, file_name: str):
|
| 16 |
df.to_excel(file_name, index=False)
|
|
|
|
| 3 |
from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper
|
| 4 |
from PyPDF2 import PdfReader
|
| 5 |
|
| 6 |
+
def pdf_to_text(pdf_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Filesystem path to the PDF to read.

    Returns:
        str: The text of all pages concatenated in order, with a newline
        appended after each page's extracted text.
    """
    text = ""
    with open(pdf_path, "rb") as file:
        # Bug fix: this module imports `PdfReader`, but the body called the
        # legacy `PdfFileReader` / `numPages` / `getPage` names, which are
        # removed in PyPDF2 3.x and would raise NameError/DeprecationError
        # at runtime. Use the current PdfReader API instead.
        pdf = PdfReader(file)
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text
|
| 14 |
|
| 15 |
+
def align_text(txt1: str, txt2: str, lang1: str, lang2: str,
               db_path: str = "docsdata.db",
               model_name: str = "sentence_transformer_multilingual") -> pd.DataFrame:
    """Align two texts sentence-by-sentence using lingtrain_aligner.

    Args:
        txt1: Source text, with paragraphs separated by newlines.
        txt2: Target text, with paragraphs separated by newlines.
        lang1: Language code of ``txt1`` (as accepted by the lingtrain splitter).
        lang2: Language code of ``txt2``.
        db_path: Working alignment database file. It is deleted and rebuilt on
            every call, so stale state from a previous run cannot leak in.
            Defaults to the previously hard-coded "docsdata.db".
        model_name: Embedding model identifier understood by lingtrain_aligner.
            The package also ships "sentence_transformer_multilingual_labse".

    Returns:
        pd.DataFrame: Columns "From" and "To", one aligned sentence pair per row.
    """
    # Work on per-paragraph lists rather than the raw strings.
    paragraphs1 = txt1.split("\n")
    paragraphs2 = txt2.split("\n")

    text1_prepared = preprocessor.mark_paragraphs(paragraphs1)
    text2_prepared = preprocessor.mark_paragraphs(paragraphs2)
    splitted_from = splitter.split_by_sentences_wrapper(text1_prepared, lang1, leave_marks=True)
    splitted_to = splitter.split_by_sentences_wrapper(text2_prepared, lang2, leave_marks=True)

    # Start from a clean database: a leftover file from an earlier alignment
    # would corrupt the result.
    # NOTE(review): relies on `os` being imported above this view — confirm.
    if os.path.isfile(db_path):
        os.unlink(db_path)

    aligner.fill_db(db_path, lang1, lang2, splitted_from, splitted_to)
    # NOTE(review): only the first two batches are aligned — confirm this
    # covers the full length of the input texts.
    batch_ids = [0, 1]

    aligner.align_db(db_path,
                     model_name,
                     batch_size=100,
                     window=40,
                     batch_ids=batch_ids,
                     save_pic=False,
                     embed_batch_size=10,
                     normalize_embeddings=True,
                     show_progress_bar=True)

    # Report conflict statistics before resolving (return values are unused;
    # presumably get_statistics prints a summary — verify against the library).
    conflicts_to_solve, rest = resolver.get_all_conflicts(db_path, min_chain_length=2, max_conflicts_len=6, batch_id=-1)
    resolver.get_statistics(conflicts_to_solve)
    resolver.get_statistics(rest)

    # Resolve conflicts in up to `steps` passes, widening the chain-length and
    # conflict-length limits on each pass; stop early once nothing remains.
    steps = 3
    batch_id = -1

    for i in range(steps):
        conflicts, rest = resolver.get_all_conflicts(db_path, min_chain_length=2 + i, max_conflicts_len=6 * (i + 1), batch_id=batch_id)
        resolver.resolve_all_conflicts(db_path, conflicts, model_name, show_logs=False)
        vis_helper.visualize_alignment_by_db(db_path, output_path="img_test1.png", lang_name_from=lang1, lang_name_to=lang2, batch_size=400, size=(600, 600), plt_show=True)

        if len(rest) == 0:
            break

    paragraphs_from, paragraphs_to, meta = reader.get_paragraphs(db_path)

    # Flatten the aligned paragraphs into (From, To) rows.
    data = [
        {"From": from_line, "To": to_line}
        for from_paragraph, to_paragraph in zip(paragraphs_from, paragraphs_to)
        for from_line, to_line in zip(from_paragraph, to_paragraph)
    ]

    return pd.DataFrame(data)
|
| 68 |
+
|
| 69 |
def save_to_excel(df, file_name: str):
    """Write *df* to an Excel workbook at *file_name*, omitting the index column."""
    df.to_excel(excel_writer=file_name, index=False)
|