utils

Sleeping

App Files Files Community

not-lain committed on Aug 28, 2025

Commit

7bc78fb

1 Parent(s): 2ebbfbc

retire unoconv

Browse files

Files changed (1) hide show

base_utils.py +96 -40

base_utils.py CHANGED Viewed

@@ -242,41 +242,101 @@ def extract_text_from_pptx(file_path):
     return "\n\n".join(text_content)
-def extract_text_from_ppt(file_path):
-    try:
-        print("file_path = ", file_path)
-        # Convert PPT to PPTX using unoconv
-        pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
-        subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
-        # Extract text from PPTX
-        presentation = Presentation(pptx_file_path)
-        text_content = []
-        for slide in presentation.slides:
-            slide_text = []
-            for shape in slide.shapes:
-                if hasattr(shape, "text"):
-                    slide_text.append(shape.text)
-            text_content.append("\n".join(slide_text))
-        # Remove the converted PPTX file
-        os.remove(pptx_file_path)
-        out = "\n\n".join(text_content)
-        return out
-    except Exception as e:
-        print(f"Error extracting text from PPT file: {e}")
-        return "Error extracting text from PPT file"
-# def extract_text_from_ppt_or_pptx(file_path):
-#     if file_path.endswith(".pptx"):
-#         return extract_text_from_pptx(file_path)
-#     elif file_path.endswith(".ppt"):
-#         return extract_text_from_ppt(file_path)
-#     else:
-#         return "Unsupported file type. Please provide a .ppt or .pptx file."
 def convert_pdf_to_image(file):
@@ -302,17 +362,13 @@ def extract_text_from_docx(file_path):
 def convert_doc_to_text(file_path):
     try:
-        subprocess.run(
-            ["unoconv", "--format", "txt", file_path],
             capture_output=True,
             text=True,
             check=True,
         )
-        txt_file_path = file_path.replace(".doc", ".txt")
-        with open(txt_file_path, "r") as f:
-            text = f.read()
-        text = text.lstrip("\ufeff")
-        os.remove(txt_file_path)
         return text
     except subprocess.CalledProcessError as e:
         print(f"Error converting {file_path} to text: {e}")

     return "\n\n".join(text_content)
def is_meaningful_text(text: str, min_length: int = 3, min_letter_ratio: float = 0.3) -> bool:
    """Return True when *text* looks like readable content rather than binary noise.

    A string is accepted only when all of the following hold:

    * it is at least ``min_length`` characters long;
    * every character is printable ASCII (0x20-0x7E);
    * it contains at least one letter;
    * letters make up at least ``min_letter_ratio`` of all its characters
      (spaces and punctuation count towards the total).

    Args:
        text: Candidate string to classify. ``None`` and ``""`` are rejected.
        min_length: Minimum number of characters required.
        min_letter_ratio: Minimum fraction (0.0-1.0) of characters that must
            be letters.

    Returns:
        True if the string passes every check, False otherwise.
    """
    if not text or len(text) < min_length:
        return False
    # Only printable ASCII is allowed. Besides control characters this also
    # rejects tabs, newlines and every non-ASCII character, accented letters
    # included.
    if not (text.isascii() and text.isprintable()):
        return False
    # The string is pure ASCII at this point, so isalpha() means A-Z / a-z.
    letters = sum(1 for ch in text if ch.isalpha())
    if letters == 0:
        return False
    return letters / len(text) >= min_letter_ratio
def extract_using_unicode_search(path: str, max_blocks: int = 30) -> str:
    """Scan a binary file for runs of UTF-16LE-encoded printable ASCII text.

    The file is read as raw bytes and walked looking for the pattern
    "printable ASCII byte (32-126) followed by a 0x00 byte", which is how
    printable ASCII characters appear when stored as UTF-16LE. Consecutive
    matches are collected into a run; any byte that breaks the pattern ends
    the run. Each finished run is stripped and kept only if
    ``is_meaningful_text`` accepts it.

    Kept runs are then de-duplicated: two runs count as duplicates when they
    are equal after removing every character that is not a word character,
    whitespace, or one of ``. , ; : ! ? -``. Runs whose cleaned form has five
    characters or fewer are dropped.

    Args:
        path: Path of the file to scan.
        max_blocks: Maximum number of unique text runs to include in the
            result.

    Returns:
        The first ``max_blocks`` unique runs joined with newlines, or the
        string ``"No text found"`` when no run survives filtering.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, "rb") as file:
        data = file.read()

    text_blocks = []
    run = bytearray()  # mutable buffer: appending is O(1), unlike bytes +=

    def flush_run() -> None:
        """Finish the current run, keeping it if it looks like real text."""
        if not run:
            return
        # The run only ever holds bytes in 32-126, so decoding cannot fail.
        text = run.decode("ascii").strip()
        run.clear()
        if is_meaningful_text(text):
            text_blocks.append(text)

    i = 0
    last = len(data) - 1  # the loop always inspects a pair of bytes
    while i < last:
        low, high = data[i], data[i + 1]
        if 32 <= low <= 126 and high == 0:
            # One UTF-16LE code unit for a printable ASCII character.
            run.append(low)
            i += 2
        else:
            # Pattern broken: close the run and resync one byte further on.
            flush_run()
            i += 1
    flush_run()

    # Remove duplicates, comparing on a punctuation-normalised form while
    # keeping the original text of the first occurrence.
    noise = re.compile(r"[^\w\s\.,;:!?\-]")
    unique, seen = [], set()
    for block in text_blocks:
        cleaned = noise.sub("", block)
        if len(cleaned) > 5 and cleaned not in seen:
            unique.append(block)
            seen.add(cleaned)
    return "\n".join(unique[:max_blocks]) if unique else "No text found"
def extract_text_from_ppt(file_path: str) -> "str | None":
    """Extract text from a legacy PowerPoint (.ppt) file.

    The path is validated and then handed to ``extract_using_unicode_search``,
    which does the actual scanning. Any exception raised during extraction is
    printed and swallowed, so callers must be prepared for a ``None`` result.

    Args:
        file_path: Path to the .ppt file, as a string or ``os.PathLike``.

    Returns:
        Whatever ``extract_using_unicode_search`` returns for the file, or
        ``None`` if extraction raised an exception.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
        ValueError: If the path does not end in ``.ppt`` (case-insensitive).
            Only the extension is checked, not the file contents.
    """
    # Normalise path-like objects so the string checks below work for them.
    file_path = os.fspath(file_path)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    if not file_path.lower().endswith(".ppt"):
        raise ValueError(f"Unsupported file format: {file_path}. Only .ppt files are supported.")
    try:
        return extract_using_unicode_search(file_path)
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and signal
        # failure with None instead of propagating.
        print(f"Error extracting text from {file_path}: {e}")
        return None
 def convert_pdf_to_image(file):
 def convert_doc_to_text(file_path):
     try:
+        result = subprocess.run(
+            ["antiword", file_path],
             capture_output=True,
             text=True,
             check=True,
         )
+        text = result.stdout.lstrip("\ufeff")
         return text
     except subprocess.CalledProcessError as e:
         print(f"Error converting {file_path} to text: {e}")