Spaces:

DracolIA
/

SAE_Question_Answering_IA_Front

Build error

File size: 3,394 Bytes

c9b1bae

import fitz
from docx import Document
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import pandas as pd
from Exceptions.FileTypeIsNotAcceptedException import FileTypeIsNotAcceptedException
from googletrans import Translator
import PyPDF2
from io import BytesIO

class Service_File:
    """Turn an uploaded document into a single English text string.

    Supported inputs are ``.docx``, ``.pdf``, ``.xlsx`` and ``.csv`` files.
    The extracted text is flattened, cut into chunks small enough to be sent
    to the translation service, and every non-English chunk is translated to
    English.
    """

    # Characters replaced by a space before chunking/translation: newlines,
    # tabs and both quote characters.
    _CLEANUP_TABLE = str.maketrans({'\n': ' ', '\t': ' ', '"': ' ', "'": ' '})

    def __init__(self):
        pass

    def file_for_string(self, file):
        """Extract the text of *file* and return it translated to English.

        Args:
            file: An uploaded file object exposing a ``name`` attribute and
                readable by the matching extractor (python-docx, PyPDF2 or
                pandas).

        Returns:
            The document text as one string, with newlines, tabs and quotes
            replaced by spaces and every chunk in English.

        Raises:
            FileTypeIsNotAcceptedException: If the file extension is not one
                of ``.docx``, ``.pdf``, ``.xlsx`` or ``.csv``.
        """
        # Extensions are matched case-insensitively so "REPORT.PDF" works too.
        name = file.name.lower()

        if name.endswith('.docx'):
            print("File is a docx")
            string = self.word_to_string(file)

        elif name.endswith('.pdf'):
            print("File is a pdf")
            string = self.pdf_to_string(file)

        elif name.endswith('.xlsx'):
            print("File is an .xlsx")
            string = self.excel_to_string(file)

        elif name.endswith('.csv'):
            print("File is a .csv")
            string = self.csv_to_string(file)

        else:
            raise FileTypeIsNotAcceptedException('File type is not accepted. Please upload a .docx, .pdf, .xlsx or .csv file.')

        string = string.translate(self._CLEANUP_TABLE)
        chunks = self.split_text(string)
        print(len(chunks))

        # Only build the translator once the file type has been accepted.
        translator = Translator()
        pieces = []
        for i, chunk in enumerate(chunks):
            print(i)
            # Skip blank chunks *before* calling the translation service:
            # there is nothing to detect or translate in them.
            if not chunk.strip():
                continue
            language = translator.detect(chunk).lang.upper()  # Verify the language of the chunk
            if language != "EN":
                chunk = translator.translate(chunk, src=language, dest="EN").text
            pieces.append(chunk.strip())

        # Join with an explicit space so the last word of one chunk can never
        # be fused with the first word of the next one.
        translate = " ".join(pieces)

        print("translate: ", translate)
        return translate

    def split_text(self, text, max_chars=1500):
        """Split *text* into chunks of at most *max_chars* characters.

        Text that already fits is returned unchanged as a single-element
        list. Otherwise the text is split on whitespace and words are packed
        greedily; each produced chunk is its words separated by single spaces
        and followed by one trailing space. A single word longer than
        *max_chars* is kept whole in a chunk of its own.

        Args:
            text: The text to split.
            max_chars: Maximum chunk length, trailing space included.

        Returns:
            A list of chunks. No empty chunk is produced on the splitting
            path.
        """
        if len(text) <= max_chars:
            return [text]

        split_texts = []
        current_words = []
        current_len = 0
        for word in text.split():
            cost = len(word) + 1  # the word plus its trailing space
            # Flush only a non-empty chunk; an oversized first word must not
            # leave an empty chunk behind.
            if current_words and current_len + cost > max_chars:
                split_texts.append(" ".join(current_words) + " ")
                current_words = []
                current_len = 0
            current_words.append(word)
            current_len += cost
        if current_words:
            split_texts.append(" ".join(current_words) + " ")

        return split_texts

    def pdf_to_string(self, file):
        """Return the text of every page of a PDF, one page per line.

        Args:
            file: A binary file-like object containing the PDF.

        Returns:
            The extracted text of all pages joined with newlines.
        """
        pdf_document = PyPDF2.PdfReader(BytesIO(file.read()))
        # A newline between pages keeps words on a page boundary apart, and
        # ``or ""`` tolerates pages for which no text could be extracted.
        return "\n".join(page.extract_text() or "" for page in pdf_document.pages)

    def word_to_string(self, file):
        """Return the paragraphs of a ``.docx`` document joined by newlines.

        Args:
            file: A path or file-like object accepted by ``docx.Document``.
        """
        doc = Document(file)
        return '\n'.join(para.text for para in doc.paragraphs)

    def excel_to_string(self, file):
        """Read an Excel file with pandas and format it as text."""
        df = pd.read_excel(file)
        return self.dataframe_to_formatted_string(df)

    def csv_to_string(self, file):
        """Read a CSV file with pandas and format it as text."""
        df = pd.read_csv(file)
        return self.dataframe_to_formatted_string(df)

    def dataframe_to_formatted_string(self, df):
        """Render a DataFrame as comma-separated lines, header first.

        Args:
            df: The DataFrame to render.

        Returns:
            One line with the column labels followed by one line per row,
            values separated by ``", "``; surrounding whitespace is stripped.
        """
        # Column labels are not necessarily strings (e.g. numeric headers).
        lines = [', '.join(str(column) for column in df.columns)]
        # itertuples keeps each column's own dtype, so integers in a frame
        # that also holds floats are not rendered as "1.0".
        lines.extend(
            ', '.join(str(value) for value in row)
            for row in df.itertuples(index=False, name=None)
        )
        return '\n'.join(lines).strip()