Spaces:

Tech-di
/

WallTD-v.1

Sleeping

App Files Files Community

Feriel080 committed on Apr 21, 2025

Commit

72d2612

verified ·

1 Parent(s): 5f70924

fixes

Browse files

Files changed (1) hide show

utils.py +32 -174

utils.py CHANGED Viewed

@@ -1,24 +1,14 @@
 from pptx import Presentation
 import pdfplumber
 from reportlab.lib.pagesizes import letter
-from reportlab.pdfgen import canvas
-from io import BytesIO
-import docx
 from pathlib import Path
-import openpyxl
 import re
-from fastapi import UploadFile
-from docx import Document
 import pandas as pd
 import pdfplumber
-from docx import Document as DocxDocument
-from pptx.util import Inches, Pt
-from reportlab.lib.pagesizes import letter
-from reportlab.pdfgen import canvas
-import os
-from io import BytesIO
-from openpyxl import Workbook
 def extract_text(file_path: Path, file_type: str) -> str:
@@ -29,104 +19,68 @@ def extract_text(file_path: Path, file_type: str) -> str:
             text = f.read()
     elif file_type == "docx":
-        doc = docx.Document(file_path)
-        text = "\n".join([para.text for para in doc.paragraphs if para.text])
     elif file_type == "xlsx":
-        wb = openpyxl.load_workbook(file_path)
-        sheet = wb.active
-        for row in sheet.rows:
-            for cell in row:
-                if cell.value is not None:
-                    text += str(cell.value) + " "
     elif file_type == "pptx":
         prs = Presentation(file_path)
         for slide in prs.slides:
             for shape in slide.shapes:
                 if shape.has_text_frame:
-                    for paragraph in shape.text_frame.paragraphs:
-                        if (clean_text := paragraph.text.strip()):
-                            text += clean_text + "\n"
                 elif shape.has_table:
                     for row in shape.table.rows:
                         for cell in row.cells:
-                            if (cell_text := cell.text.strip()):
-                                text += cell_text + "\n"
     elif file_type == "pdf":
         with pdfplumber.open(file_path) as pdf:
-            text = "\n".join(
-                page.extract_text()
-                for page in pdf.pages
-                if page.extract_text()
-            )
-    return text.strip()
-def save_file(text: str, original_path: Path, file_type: str, output_path: Path):
     if file_type == "docx":
-        doc = docx.Document()
         doc.add_paragraph(text)
         doc.save(output_path)
     elif file_type == "xlsx":
-        wb = openpyxl.Workbook()
-        sheet = wb.active
-        text_lines = text.split(
-            "\n"
-        )
-        for i, line in enumerate(text_lines, start=1):
-            sheet.cell(row=i, column=1, value=line)
-        wb.save(output_path)
     elif file_type == "pptx":
         prs = Presentation()
         slide_layout = prs.slide_layouts[1]
-        max_lines = 25
         text_lines = text.split('\n')
-        chunks = []
-        current_chunk = []
-        for line in text_lines:
-            current_chunk.append(line)
-            if len(current_chunk) >= max_lines:
-                chunks.append('\n'.join(current_chunk))
-                current_chunk = []
-        if current_chunk:
-            chunks.append('\n'.join(current_chunk))
         for chunk in chunks:
             slide = prs.slides.add_slide(slide_layout)
             content = slide.shapes.placeholders[1]
             text_frame = content.text_frame
-            text_frame.clear()
-            paragraph = text_frame.add_paragraph()
-            paragraph.text = chunk
-            paragraph.font.size = Pt(13)
         prs.save(output_path)
     elif file_type == "pdf":
-         with open(output_path, "wb") as f:
-            pdf_buffer = BytesIO()
-            c = canvas.Canvas(pdf_buffer, pagesize=letter)
-            text_lines = text.split("\n")
-            y = 750
-            for line in text_lines:
-                c.drawString(72, y, line)
-                y -= 12
-                if y < 50:
-                    c.showPage()
-                    y = 750
-            c.save()
-            f.write(pdf_buffer.getvalue())
     else:
         with open(output_path, "w", encoding="utf-8") as f:
@@ -158,24 +112,17 @@ def verify_summary(summary: str, original: str) -> str:
     return '. '.join(verified) if verified else summary[:500]
 def ensure_complete_sentences(text: str) -> str:
-    """Guarantees proper sentence structure with robust error handling"""
     if not text or not isinstance(text, str):
         return ""
     try:
-        # Normalize whitespace
         text = ' '.join(text.split())
-        # Split on sentence boundaries
         sentences = re.split(r'(?<=[.!?])\s+', text)
-        # Filter and validate sentences
         valid_sentences = [
             s.strip() for s in sentences
             if s.strip() and s[-1] in {'.', '!', '?'}
         ]
-        # Reconstruct text with proper spacing
         reconstructed = ' '.join(valid_sentences)
         # Final safety check
@@ -193,93 +140,4 @@ def ensure_complete_sentences(text: str) -> str:
         return reconstructed
     except Exception:
-        return text
-async def convert_to_text(file: UploadFile) -> str:
-    file_extension = file.filename.split(".")[-1].lower()
-    content = await file.read()
-    if file_extension == "txt":
-        return content.decode("utf-8")
-    elif file_extension == "docx":
-        doc = Document(BytesIO(content))
-        return "\n".join([para.text for para in doc.paragraphs])
-    elif file_extension == "pptx":
-        ppt = Presentation(BytesIO(content))
-        text = []
-        for slide in ppt.slides:
-            for shape in slide.shapes:
-                if hasattr(shape, "text"):
-                    text.append(shape.text)
-        return "\n".join(text)
-    elif file_extension == "pdf":
-        with pdfplumber.open(BytesIO(content)) as pdf:
-            return "\n".join([page.extract_text() for page in pdf.pages])
-    elif file_extension in ["xlsx", "xls"]:
-        file_like = BytesIO(content)
-        df = pd.read_excel(file_like)
-        return df.to_string()
-    else:
-        raise ValueError(f"Unsupported file type: {file_extension}")
-# save translated text to a file
-def save_translated_file(translated_text: str, original_filename: str) -> str:
-    file_extension = os.path.splitext(original_filename)[-1].lower()
-    output_dir = "translated_files"
-    os.makedirs(output_dir, exist_ok=True)
-    output_filename = f"translated_{os.path.splitext(original_filename)[0]}{file_extension}"
-    output_file_path = os.path.join(output_dir, output_filename)
-    if file_extension == ".docx":
-        doc = DocxDocument()
-        doc.add_paragraph(translated_text)
-        doc.save(output_file_path)
-    elif file_extension == ".pdf":
-        with open(output_file_path, "wb") as f:  # create new pdf
-            pdf_buffer = BytesIO()
-            c = canvas.Canvas(pdf_buffer, pagesize=letter)
-            text_lines = translated_text.split("\n")
-            y = 750  # Position verticale initiale
-            for line in text_lines:
-                c.drawString(72, y, line)
-                y -= 12
-                if y < 50:
-                    c.showPage()
-                    y = 750
-            c.save()
-            f.write(pdf_buffer.getvalue())
-    elif file_extension == ".pptx":
-        prs = Presentation()
-        slide = prs.slides.add_slide(prs.slide_layouts[5])
-        left = top = width = height = Inches(1)
-        txBox = slide.shapes.add_textbox(left, top, width, height)
-        tf = txBox.text_frame
-        tf.text = translated_text
-        prs.save(output_file_path)
-    elif file_extension in [".xlsx", ".xls"]:
-        if file_extension == ".xlsx":
-            wb = Workbook()
-            ws = wb.active
-            text_lines = translated_text.split("\n")
-            for i, line in enumerate(text_lines, start=1):
-                ws.cell(row=i, column=1, value=line)
-            wb.save(output_file_path)
-        else:
-            df = pd.DataFrame([translated_text.split("\n")])
-            df.to_excel(output_file_path, index=False, header=False)
-    else:
-        output_filename = f"translated_{os.path.splitext(original_filename)[0]}.txt"
-        output_file_path = os.path.join(output_dir, output_filename)
-        with open(output_file_path, "w", encoding="utf-8") as f:
-            f.write(translated_text)
-    return output_file_path

 from pptx import Presentation
+from docx import Document
 import pdfplumber
 from reportlab.lib.pagesizes import letter
 from pathlib import Path
 import re
 import pandas as pd
 import pdfplumber
+from concurrent.futures import ThreadPoolExecutor
+from reportlab.platypus import SimpleDocTemplate, Paragraph
+from reportlab.lib.styles import getSampleStyleSheet
 def extract_text(file_path: Path, file_type: str) -> str:
             text = f.read()
     elif file_type == "docx":
+        doc = Document(file_path)
+        return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
     elif file_type == "xlsx":
+        df = pd.read_excel(file_path, engine="openpyxl")
+        return df.to_string(index=False, header=False).strip()
     elif file_type == "pptx":
         prs = Presentation(file_path)
+        text_parts = []
         for slide in prs.slides:
             for shape in slide.shapes:
                 if shape.has_text_frame:
+                    text_parts.append(shape.text_frame.text.strip())
                 elif shape.has_table:
                     for row in shape.table.rows:
                         for cell in row.cells:
+                            if cell.text.strip():
+                                text_parts.append(cell.text.strip())
+        return "\n".join(text_parts)
     elif file_type == "pdf":
         with pdfplumber.open(file_path) as pdf:
+            def extract_page(page):
+                return page.extract_text_simple() or ""
+            with ThreadPoolExecutor() as executor:
+                text_parts = list(executor.map(extract_page, pdf.pages))
+        return "\n".join(part for part in text_parts if part).strip()
+    return text
+def save_file(text: str, file_type: str, output_path: Path):
     if file_type == "docx":
+        doc = Document()
         doc.add_paragraph(text)
         doc.save(output_path)
     elif file_type == "xlsx":
+        df = pd.DataFrame(text.split("\n"), columns=["Content"])
+        df.to_excel(output_path, index=False, engine="xlsxwriter")
     elif file_type == "pptx":
         prs = Presentation()
         slide_layout = prs.slide_layouts[1]
         text_lines = text.split('\n')
+        chunks = [text_lines[i:i+25] for i in range(0, len(text_lines), 25)]
         for chunk in chunks:
             slide = prs.slides.add_slide(slide_layout)
             content = slide.shapes.placeholders[1]
             text_frame = content.text_frame
+            text_frame.clear()
+            text_frame.text = "\n".join(chunk)
         prs.save(output_path)
     elif file_type == "pdf":
+        doc = SimpleDocTemplate(str(output_path), pagesize=letter)
+        styles = getSampleStyleSheet()
+        flowables = [Paragraph(line, styles["Normal"]) for line in text.split("\n") if line.strip()]
+        doc.build(flowables)
     else:
         with open(output_path, "w", encoding="utf-8") as f:
     return '. '.join(verified) if verified else summary[:500]
 def ensure_complete_sentences(text: str) -> str:
     if not text or not isinstance(text, str):
         return ""
     try:
         text = ' '.join(text.split())
         sentences = re.split(r'(?<=[.!?])\s+', text)
         valid_sentences = [
             s.strip() for s in sentences
             if s.strip() and s[-1] in {'.', '!', '?'}
         ]
         reconstructed = ' '.join(valid_sentences)
         # Final safety check
         return reconstructed
     except Exception:
+        return text