deprecated
Browse files
utils.py
CHANGED
|
@@ -1,19 +1,19 @@
|
|
| 1 |
import os
|
| 2 |
import pandas as pd
|
| 3 |
-
from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper
|
| 4 |
from PyPDF2 import PdfReader
|
|
|
|
| 5 |
|
| 6 |
def pdf_to_text(pdf_path):
|
| 7 |
text = ""
|
| 8 |
with open(pdf_path, "rb") as file:
|
| 9 |
reader = PdfReader(file)
|
| 10 |
-
for page_num in range(reader.
|
| 11 |
-
page = reader.
|
| 12 |
text += page.extract_text() + "\n"
|
| 13 |
return text
|
| 14 |
|
| 15 |
def align_text(txt1: str, txt2: str, lang1: str, lang2: str) -> pd.DataFrame:
|
| 16 |
-
db_path = "
|
| 17 |
models = ["sentence_transformer_multilingual", "sentence_transformer_multilingual_labse"]
|
| 18 |
model_name = models[0]
|
| 19 |
|
|
@@ -65,6 +65,6 @@ def align_text(txt1: str, txt2: str, lang1: str, lang2: str) -> pd.DataFrame:
|
|
| 65 |
|
| 66 |
df = pd.DataFrame(data)
|
| 67 |
return df
|
| 68 |
-
|
| 69 |
def save_to_excel(df, file_name: str):
    """Write ``df`` to an Excel file, omitting the index column.

    Args:
        df: Object exposing ``to_excel`` (a pandas DataFrame is expected).
        file_name: Destination path, passed unchanged to ``df.to_excel``.
    """
    df.to_excel(file_name, index=False)
|
|
|
|
| 1 |
import os
|
| 2 |
import pandas as pd
|
|
|
|
| 3 |
from PyPDF2 import PdfReader
|
| 4 |
+
from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, vis_helper
|
| 5 |
|
| 6 |
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in document order; each page's text is
        followed by a newline, so the result ends with a newline whenever
        the document has at least one page.
    """
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        # Extraction stays inside the ``with`` so the stream is still open
        # while the pages are read.
        # ``or ""`` keeps a page without extractable text from raising on
        # ``None + "\n"`` (NOTE(review): confirm whether the installed
        # PyPDF2 version can actually return None here).
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
|
| 14 |
|
| 15 |
def align_text(txt1: str, txt2: str, lang1: str, lang2: str) -> pd.DataFrame:
|
| 16 |
+
db_path = "bilingualdata.db"
|
| 17 |
models = ["sentence_transformer_multilingual", "sentence_transformer_multilingual_labse"]
|
| 18 |
model_name = models[0]
|
| 19 |
|
|
|
|
| 65 |
|
| 66 |
df = pd.DataFrame(data)
|
| 67 |
return df
|
| 68 |
+
|
| 69 |
def save_to_excel(df, file_name: str):
    """Write ``df`` to an Excel file, omitting the index column.

    Args:
        df: Object exposing ``to_excel`` (a pandas DataFrame is expected).
        file_name: Destination path, passed unchanged to ``df.to_excel``.
    """
    df.to_excel(file_name, index=False)
|