Spaces:

Rezuwan
/

USB_Unmad_Satirical_Bot

Sleeping

App Files Files Community

Rezuwan committed on Jul 9, 2025

Commit

a9328e8

verified ·

1 Parent(s): 729d844

Upload 2 files

Browse files

Files changed (2) hide show

app.py +62 -0
requirements.txt +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import os
+from typing import Optional
+import docx
+from PIL import Image
+import pytesseract
+from pdf2image import convert_from_path
+import fitz  # PyMuPDF
+def extract_text_from_txt(file_path: str) -> str:
+    """Return the full contents of the file at *file_path*, decoded as UTF-8."""
+    # The encoding is pinned explicitly so decoding does not fall back to
+    # the platform's default encoding.
+    with open(file_path, encoding='utf-8') as stream:
+        contents = stream.read()
+    return contents
+def extract_text_from_md(file_path: str) -> str:
+    """Return the raw contents of the file at *file_path*, decoded as UTF-8.
+
+    The text is returned exactly as read; no Markdown markup is stripped
+    or rendered.
+    """
+    with open(file_path, mode='r', encoding='utf-8') as md_file:
+        raw_markdown = md_file.read()
+    return raw_markdown
+def extract_text_from_pdf(file_path: str) -> str:
+    """Return the text of every page of the PDF at *file_path*, in page order.
+
+    Each page is read with ``page.get_text()`` and the per-page results are
+    concatenated with no separator added between them. The result is an
+    empty string when no page yields any text.
+    """
+    with fitz.open(file_path) as doc:
+        # Join once instead of growing a string with += on every page; the
+        # generator is fully consumed while the document is still open.
+        return "".join(page.get_text() for page in doc)
+def extract_text_from_docx(file_path: str) -> str:
+    """Return the paragraph text of the .docx file at *file_path*.
+
+    Only ``doc.paragraphs`` is read; the text of each paragraph is joined
+    with a single newline.
+    """
+    document = docx.Document(file_path)
+    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
+    return '\n'.join(paragraph_texts)
+def extract_text_from_image(file_path: str) -> str:
+    """Return the text that pytesseract recognises in the image at *file_path*.
+
+    The image is opened with ``PIL.Image.open`` and passed to
+    ``pytesseract.image_to_string`` using its default settings.
+    """
+    # Open the image in a context manager so its underlying file handle is
+    # closed once OCR finishes, instead of staying open until garbage
+    # collection.
+    with Image.open(file_path) as image:
+        return pytesseract.image_to_string(image)
+def extract_text_from_scanned_pdf(file_path: str) -> str:
+    """Return OCR text for the PDF at *file_path*.
+
+    The PDF is converted to images with ``convert_from_path``; each image
+    is run through ``pytesseract.image_to_string`` and the results are
+    concatenated in order with no separator added between them.
+    """
+    page_images = convert_from_path(file_path)
+    # Join once instead of growing a string with += for every image.
+    return "".join(
+        pytesseract.image_to_string(page_image) for page_image in page_images
+    )
+def extract_text(file_path: str) -> Optional[str]:
+    """Extract text from *file_path*, choosing an extractor by file extension.
+
+    The extension is compared case-insensitively. Handled extensions are
+    ``.txt``, ``.md``, ``.pdf``, ``.docx``, ``.jpg``, ``.jpeg`` and ``.png``.
+
+    Returns the extracted text, or ``None`` when the extension is not
+    handled or when reading a PDF raises. Both ``None`` cases print a
+    message. Exceptions raised by the non-PDF extractors propagate to the
+    caller.
+    """
+    _, suffix = os.path.splitext(file_path)
+    suffix = suffix.lower()
+    if suffix == '.txt':
+        return extract_text_from_txt(file_path)
+    if suffix == '.md':
+        return extract_text_from_md(file_path)
+    if suffix == '.pdf':
+        try:
+            embedded_text = extract_text_from_pdf(file_path)
+            if embedded_text.strip():
+                return embedded_text
+            # Nothing but whitespace came back from the text layer, so
+            # fall back to OCR on the rendered pages.
+            return extract_text_from_scanned_pdf(file_path)
+        except Exception as e:
+            print(f"Error reading PDF: {e}")
+            return None
+    if suffix == '.docx':
+        return extract_text_from_docx(file_path)
+    if suffix in ('.jpg', '.jpeg', '.png'):
+        return extract_text_from_image(file_path)
+    print(f"Unsupported file type: {suffix}")
+    return None

requirements.txt ADDED Viewed

Binary file (312 Bytes). View file