Spaces:
Build error
Build error
File size: 3,394 Bytes
c9b1bae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import fitz
from docx import Document
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import pandas as pd
from Exceptions.FileTypeIsNotAcceptedException import FileTypeIsNotAcceptedException
from googletrans import Translator
import PyPDF2
from io import BytesIO
class Service_File:
    """Turn an uploaded document into a single English text string.

    Supported inputs are .docx, .pdf, .xlsx and .csv files. Text is pulled
    out with python-docx, PyPDF2 or pandas respectively and then passed
    through googletrans chunk by chunk.
    """

    # (lower-case extension, message printed on detection, extractor method).
    # Method names are resolved with getattr so subclasses can override them.
    _EXTRACTORS = (
        ('.docx', "File is a docx", 'word_to_string'),
        ('.pdf', "File is a pdf", 'pdf_to_string'),
        ('.xlsx', "File is an .xlsx", 'excel_to_string'),
        ('.csv', "File is a .csv", 'csv_to_string'),
    )

    # Newlines, tabs and both quote characters are each replaced by a space
    # before the text is chunked.
    _FLATTEN_TABLE = str.maketrans(dict.fromkeys('\n\t"\'', ' '))

    def __init__(self):
        pass

    def file_for_string(self, file):
        """Extract the text of *file* and return it in English.

        Args:
            file: File-like object with a ``name`` attribute; the extension
                of ``name`` (matched case-insensitively) selects the
                extractor.

        Returns:
            The extracted text with newlines, tabs and quotes replaced by
            spaces. Chunks whose detected language is not English are
            translated; all chunks are stripped and joined with one space.

        Raises:
            FileTypeIsNotAcceptedException: If the extension is not one of
                .docx, .pdf, .xlsx or .csv.
        """
        string = self._extract_text(file).translate(self._FLATTEN_TABLE)

        translator = Translator()
        chunks = self.split_text(string)
        print(len(chunks))

        pieces = []
        for index, chunk in enumerate(chunks):
            print(index)
            # Skip blank chunks before asking the translator anything:
            # there is no language to detect in an empty string.
            if not chunk.strip():
                continue
            # Verify the language of the chunk.
            language = translator.detect(chunk).lang.upper()
            if language != "EN":
                chunk = translator.translate(
                    chunk, src=language, dest="EN"
                ).text
            # Translated text is not guaranteed to keep the chunk's trailing
            # space, so normalise every piece and re-insert the separator in
            # the join below; otherwise words fuse at chunk boundaries.
            pieces.append(chunk.strip())
        return " ".join(pieces)

    def _extract_text(self, file):
        """Dispatch *file* to the extractor matching its extension."""
        # Compare in lower case so that e.g. "REPORT.PDF" is accepted too.
        name = file.name.lower()
        for extension, message, method_name in self._EXTRACTORS:
            if name.endswith(extension):
                print(message)
                return getattr(self, method_name)(file)
        raise FileTypeIsNotAcceptedException(
            'File type is not accepted. Please upload a .docx, .pdf, .xlsx or .csv file.'
        )

    def split_text(self, text, max_chars=1500):
        """Split *text* into chunks of at most *max_chars* characters.

        Text that already fits is returned unchanged as a single-element
        list. Otherwise the text is split on whitespace and words are packed
        greedily; every word in a chunk is followed by one space, and that
        space counts towards the limit. A single word longer than
        *max_chars* is never broken up and forms an over-length chunk of its
        own. Empty chunks are never produced, so whitespace-only text longer
        than *max_chars* yields an empty list.

        Args:
            text: The string to split.
            max_chars: Maximum length of each chunk.

        Returns:
            A list of chunk strings.
        """
        if len(text) <= max_chars:
            return [text]

        chunks = []
        pending_words = []
        pending_size = 0  # chunk length with a trailing space per word
        for word in text.split():
            needed = len(word) + 1
            # Flush only when something is pending: an over-long first word
            # must not leave an empty chunk in front of it.
            if pending_words and pending_size + needed > max_chars:
                chunks.append(" ".join(pending_words) + " ")
                pending_words = []
                pending_size = 0
            pending_words.append(word)
            pending_size += needed
        if pending_words:
            chunks.append(" ".join(pending_words) + " ")
        return chunks

    def pdf_to_string(self, file):
        """Return the text of every page of a PDF, concatenated in order.

        Args:
            file: File-like object whose ``read()`` returns the PDF bytes.
        """
        reader = PyPDF2.PdfReader(BytesIO(file.read()))
        return "".join(page.extract_text() for page in reader.pages)

    def word_to_string(self, file):
        """Return the paragraphs of a .docx document, one per line.

        Args:
            file: Anything accepted by ``docx.Document``.
        """
        doc = Document(file)
        return '\n'.join(para.text for para in doc.paragraphs)

    def excel_to_string(self, file):
        """Read *file* with ``pandas.read_excel`` and format it as text."""
        return self.dataframe_to_formatted_string(pd.read_excel(file))

    def csv_to_string(self, file):
        """Read *file* with ``pandas.read_csv`` and format it as text."""
        return self.dataframe_to_formatted_string(pd.read_csv(file))

    def dataframe_to_formatted_string(self, df):
        """Render *df* as text: a header line, then one line per row.

        Column labels and cell values are converted with ``str`` and joined
        with ``', '``; lines are joined with newlines and the result is
        stripped of leading and trailing whitespace.

        Args:
            df: The DataFrame to render.

        Returns:
            The formatted string.
        """
        # Labels are not always strings (e.g. numeric headers), so convert
        # them explicitly instead of letting str.join raise TypeError.
        lines = [', '.join(str(column) for column in df.columns)]
        lines.extend(
            ', '.join(str(value) for value in row)
            for _, row in df.iterrows()
        )
        return '\n'.join(lines).strip()
|