# Page header captured together with this file (not part of the program):
#   PascalZhan's picture
#   Synchronisation Frontend DracolIA QA (c9b1bae)
import logging
from io import BytesIO

import fitz
import pandas as pd
import PyPDF2
from docx import Document
from googletrans import Translator
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

from Exceptions.FileTypeIsNotAcceptedException import FileTypeIsNotAcceptedException


class Service_File:
    """Turn an uploaded document into a single flat English string.

    Supported inputs are ``.docx``, ``.pdf``, ``.xlsx`` and ``.csv`` files.
    The extracted text is flattened (newlines, tabs and quotes become
    spaces), cut into chunks and every non-English chunk is translated to
    English with ``googletrans``.
    """

    _log = logging.getLogger(__name__)

    # Lower-cased file extension -> name of the method that extracts its text.
    # Methods are looked up by name so subclasses can override them.
    _EXTRACTORS = {
        '.docx': 'word_to_string',
        '.pdf': 'pdf_to_string',
        '.xlsx': 'excel_to_string',
        '.csv': 'csv_to_string',
    }

    # Characters that are replaced by a single space before translation.
    _FLATTEN_TABLE = str.maketrans({'\n': ' ', '\t': ' ', '"': ' ', "'": ' '})

    def __init__(self):
        pass

    def file_for_string(self, file):
        """Extract the text of *file* and return it translated to English.

        Args:
            file: File-like object with a ``name`` attribute; the extension
                of ``name`` (matched case-insensitively) selects the
                extractor.

        Returns:
            The flattened text, with every non-English chunk translated.

        Raises:
            FileTypeIsNotAcceptedException: If the extension is not one of
                ``.docx``, ``.pdf``, ``.xlsx`` or ``.csv``.
        """
        filename = file.name.lower()
        extractor = None
        for extension, method_name in self._EXTRACTORS.items():
            if filename.endswith(extension):
                extractor = getattr(self, method_name)
                break
        if extractor is None:
            raise FileTypeIsNotAcceptedException(
                'File type is not accepted. Please upload a .docx, .pdf, .xlsx or .csv file.'
            )

        self._log.debug("Extracting text with %s", extractor.__name__)
        flattened = extractor(file).translate(self._FLATTEN_TABLE)
        chunks = self.split_text(flattened)
        self._log.debug("Text split into %d chunk(s)", len(chunks))
        return self._translate_to_english(chunks)

    def _translate_to_english(self, chunks):
        """Return *chunks* joined into one string, non-English ones translated."""
        translator = Translator()
        pieces = []
        for position, chunk in enumerate(chunks):
            # Blank chunks carry nothing to detect or translate; keep them
            # verbatim instead of sending them to the translation service.
            if not chunk.strip():
                pieces.append(chunk)
                continue

            language = translator.detect(chunk).lang.upper()
            if language == "EN":
                pieces.append(chunk)
                continue

            self._log.debug("Translating chunk %d from %s", position, language)
            translated = translator.translate(chunk, src=language, dest="EN").text
            # Chunks end with the space that separates them from the next
            # one; re-add it if the translated text came back without it so
            # words on either side of a chunk boundary do not fuse.
            if chunk[-1].isspace() and not translated[-1:].isspace():
                translated += " "
            pieces.append(translated)
        return "".join(pieces)

    def split_text(self, text, max_chars=1500):
        """Split *text* into chunks of at most *max_chars* characters.

        Text that already fits is returned unchanged as a single chunk.
        Otherwise the text is split on whitespace and words are packed
        greedily, each word followed by one space. A single word too long to
        fit in a chunk is cut into pieces of ``max_chars - 1`` characters
        (only the final piece gets the trailing space).

        Args:
            text: The string to split.
            max_chars: Upper bound for the length of each chunk.

        Returns:
            A non-empty list of chunks.
        """
        if len(text) <= max_chars:
            return [text]

        # Longest word that still fits in a chunk together with its space.
        word_budget = max_chars - 1
        chunks = []
        current = ""
        for word in text.split():
            if len(current) + len(word) + 1 <= max_chars:
                current += word + " "
                continue

            # The word does not fit: close the running chunk (never emit an
            # empty one) and start over with this word.
            if current:
                chunks.append(current)
            if word_budget >= 1:
                while len(word) > word_budget:
                    chunks.append(word[:word_budget])
                    word = word[word_budget:]
            current = word + " "
        chunks.append(current)
        return chunks

    def pdf_to_string(self, file):
        """Return the concatenated text of every page of the PDF in *file*."""
        reader = PyPDF2.PdfReader(BytesIO(file.read()))
        # A page that yields no text is treated as empty.
        return "".join(page.extract_text() or "" for page in reader.pages)

    def word_to_string(self, file):
        """Return the paragraphs of the .docx in *file*, one per line."""
        return '\n'.join(paragraph.text for paragraph in Document(file).paragraphs)

    def excel_to_string(self, file):
        """Read *file* with ``pandas.read_excel`` and format it as text."""
        return self.dataframe_to_formatted_string(pd.read_excel(file))

    def csv_to_string(self, file):
        """Read *file* with ``pandas.read_csv`` and format it as text."""
        return self.dataframe_to_formatted_string(pd.read_csv(file))

    def dataframe_to_formatted_string(self, df):
        """Render *df* as text: a header line, then one line per row.

        Column labels and cell values are converted with ``str`` and joined
        with ``', '``; lines are joined with newlines and the result is
        stripped of surrounding whitespace.
        """
        lines = [', '.join(map(str, df.columns))]
        # itertuples keeps each column's own dtype, whereas iterrows builds a
        # Series per row and would upcast e.g. an int next to a float.
        lines.extend(
            ', '.join(map(str, row))
            for row in df.itertuples(index=False, name=None)
        )
        return '\n'.join(lines).strip()