Spaces:
Sleeping
Sleeping
| import fitz # pymupdf | |
| from docx import Document | |
| import pptx | |
| import os | |
| from typing import Optional | |
def extract_text_from_pdf(file_path: str) -> Optional[str]:
    """Extract text from a PDF file using pymupdf (faster than tika).

    The document is opened as a context manager so that it is closed again
    even when reading one of its pages fails.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with leading and
        trailing whitespace stripped, or ``None`` when nothing is left after
        stripping or when the file cannot be read.
    """
    try:
        with fitz.open(file_path) as doc:
            # Join once instead of growing a string page by page.
            text = "".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort boundary: report the failure and signal "no text".
        print(f"Error reading PDF: {e}")
        return None
    # Whitespace-only output is reported as None, just like empty output.
    return text.strip() or None
def extract_text_from_docx(file_path: str) -> Optional[str]:
    """Extract text from a Word (DOCX) file.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The text of every paragraph that is not empty or whitespace-only,
        joined with newlines, or ``None`` when the file cannot be read.
    """
    try:
        kept = []
        for paragraph in Document(file_path).paragraphs:
            content = paragraph.text
            if not content.strip():
                # Skip paragraphs that are empty or whitespace-only.
                continue
            kept.append(content)
        return "\n".join(kept)
    except Exception as err:
        print(f"Error reading DOCX: {err}")
        return None
def extract_text_from_pptx(file_path: str) -> Optional[str]:
    """Extract text from a PowerPoint (PPTX) file.

    Args:
        file_path: Path of the PPTX file to read.

    Returns:
        The text of every shape that has a ``text`` attribute, in slide
        order and joined with newlines, or ``None`` when no such shape
        exists or the file cannot be read.
    """
    try:
        deck = pptx.Presentation(file_path)
        chunks = []
        for slide in deck.slides:
            # Only shapes exposing a ``text`` attribute contribute output.
            chunks.extend(
                shape.text for shape in slide.shapes if hasattr(shape, "text")
            )
        if not chunks:
            return None
        return "\n".join(chunks)
    except Exception as err:
        print(f"Error reading PPTX: {err}")
        return None
def extract_text_from_document(file_path: str) -> Optional[str]:
    """Extract text from any supported document (PDF/DOCX/PPTX/TXT).

    The reader is chosen from the file extension, matched
    case-insensitively.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text, or ``None`` when the path does not exist, the
        extension is not supported, or reading the file fails.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    # Lower-case once so every extension check is case-insensitive.
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    if lowered.endswith('.docx'):
        return extract_text_from_docx(file_path)
    if lowered.endswith('.pptx'):
        return extract_text_from_pptx(file_path)
    if lowered.endswith('.txt'):
        try:
            # "utf-8-sig" reads plain UTF-8 and also drops a leading byte
            # order mark, which would otherwise leak into the returned text.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading TXT: {e}")
            return None

    print(f"Unsupported file format: {file_path}")
    return None