Spaces:
Build error
Build error
| import fitz | |
| from docx import Document | |
| from reportlab.pdfgen import canvas | |
| from reportlab.lib.pagesizes import letter | |
| import pandas as pd | |
| from Exceptions.FileTypeIsNotAcceptedException import FileTypeIsNotAcceptedException | |
| from googletrans import Translator | |
| import PyPDF2 | |
| from io import BytesIO | |
| class Service_File: | |
| def __init__(self): | |
| pass | |
| def file_for_string(self, file): | |
| translator = Translator() | |
| if file.name.endswith('.docx'): | |
| print("File is a docx") | |
| string = self.word_to_string(file) | |
| elif file.name.endswith('.pdf'): | |
| print("File is a pdf") | |
| string = self.pdf_to_string(file) | |
| elif file.name.endswith('.xlsx'): | |
| print("File is an .xlsx") | |
| string = self.excel_to_string(file) | |
| elif file.name.endswith('.csv'): | |
| print("File is a .csv") | |
| string = self.csv_to_string(file) | |
| else: | |
| raise FileTypeIsNotAcceptedException('File type is not accepted. Please upload a .docx, .pdf, .xlsx or .csv file.') | |
| string = string.replace('\n', ' ').replace('\t', ' ').replace('"', ' ').replace("'", ' ') | |
| split = self.split_text(string) | |
| print(len(split)) | |
| translate = "" | |
| for i in range(len(split)): | |
| print(i) | |
| language = translator.detect(str(split[i])).lang.upper() # Verify the language of the prompt | |
| if split[i] != "" or len(split[i]) != 0: | |
| if language != "EN": | |
| translate = translate + translator.translate(str(split[i]), src=language, dest="EN").text | |
| else: | |
| translate = translate + split[i] | |
| print("translate: ", translate) | |
| return translate | |
| def split_text(self, text, max_chars=1500): | |
| if len(text) <= max_chars: | |
| return [text] | |
| split_texts = [] | |
| current_text = "" | |
| words = text.split() | |
| for word in words: | |
| if len(current_text) + len(word) + 1 <= max_chars: | |
| current_text += word + " " | |
| else: | |
| split_texts.append(current_text) | |
| current_text = word + " " | |
| split_texts.append(current_text) | |
| return split_texts | |
| def pdf_to_string(self, file): | |
| pdf_data = file.read() | |
| pdf_document = PyPDF2.PdfReader(BytesIO(pdf_data)) | |
| text = "" | |
| for page_number in range(len(pdf_document.pages)): | |
| text += pdf_document.pages[page_number].extract_text() | |
| return text | |
| def word_to_string(self, file): | |
| doc = Document(file) | |
| full_text = [] | |
| for para in doc.paragraphs: | |
| full_text.append(para.text) | |
| return '\n'.join(full_text) | |
| def excel_to_string(self, file): | |
| df = pd.read_excel(file) | |
| return self.dataframe_to_formatted_string(df) | |
| def csv_to_string(self, file): | |
| df = pd.read_csv(file) | |
| return self.dataframe_to_formatted_string(df) | |
| def dataframe_to_formatted_string(self, df): | |
| formatted_string = ', '.join(df.columns) + '\n' | |
| for index, row in df.iterrows(): | |
| line_values = [str(value) for value in row] | |
| formatted_string += ', '.join(line_values) + '\n' | |
| return formatted_string.strip() | |