Spaces:
Paused
Paused
Update modules/document_processor.py
Browse files
modules/document_processor.py
CHANGED
|
@@ -12,7 +12,6 @@ import docx
|
|
| 12 |
import PyPDF2
|
| 13 |
import fitz # PyMuPDF
|
| 14 |
import pdfplumber
|
| 15 |
-
import mammoth
|
| 16 |
from openpyxl import load_workbook
|
| 17 |
from PIL import Image
|
| 18 |
import pytesseract
|
|
@@ -125,10 +124,6 @@ class DocumentProcessor:
|
|
| 125 |
try:
|
| 126 |
doc = docx.Document(file_path)
|
| 127 |
extracted_data["text"] = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
|
| 128 |
-
if not extracted_data["text"].strip():
|
| 129 |
-
with open(file_path, "rb") as docx_file:
|
| 130 |
-
result = mammoth.extract_raw_text(docx_file)
|
| 131 |
-
extracted_data["text"] = result.value
|
| 132 |
except Exception as e:
|
| 133 |
extracted_data["error"] = f"خطأ في معالجة ملف DOCX: {str(e)}"
|
| 134 |
-
return extracted_data
|
|
|
|
| 12 |
import PyPDF2
|
| 13 |
import fitz # PyMuPDF
|
| 14 |
import pdfplumber
|
|
|
|
| 15 |
from openpyxl import load_workbook
|
| 16 |
from PIL import Image
|
| 17 |
import pytesseract
|
|
|
|
| 124 |
try:
|
| 125 |
doc = docx.Document(file_path)
|
| 126 |
extracted_data["text"] = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
except Exception as e:
|
| 128 |
extracted_data["error"] = f"خطأ في معالجة ملف DOCX: {str(e)}"
|
| 129 |
+
return extracted_data
|