Spaces:
Sleeping
Sleeping
| import os | |
| from PyPDF2 import PdfReader | |
| from docx import Document | |
def process_pdf(file_path):
    """Extract the text of a PDF file and return it as a list of paragraphs.

    Each page is read through ``PdfReader`` and its text is followed by a
    newline. The combined text is then split wherever two consecutive
    newlines occur.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of paragraph strings. Paragraphs that are empty or contain
        only whitespace are dropped, matching what ``process_docx`` does,
        so a PDF without extractable text yields ``[]`` rather than ``['']``.
    """
    reader = PdfReader(file_path)
    # Build the text with one join instead of repeated `+=` in a loop.
    # `or ""` is defensive: a page whose extraction yields nothing (None or
    # an empty string) contributes only its trailing newline.
    text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)
    # Split the text into paragraphs, skipping blank ones.
    return [paragraph for paragraph in text.split("\n\n") if paragraph.strip()]
def process_docx(file_path):
    """Return the non-blank paragraphs of a .docx file.

    Args:
        file_path: Path of the document, passed directly to ``Document``.

    Returns:
        A list with the text of every paragraph whose text is not empty
        after stripping whitespace, in document order. The text itself is
        returned unstripped.
    """
    non_blank = []
    for para in Document(file_path).paragraphs:
        # A paragraph made only of whitespace strips to "" and is skipped.
        if para.text.strip():
            non_blank.append(para.text)
    return non_blank
def process_txt(file_path):
    """Read a UTF-8 text file and return it as a list of paragraphs.

    The file content is split wherever two consecutive newlines occur.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list of paragraph strings. Paragraphs that are empty or contain
        only whitespace are dropped, matching what ``process_docx`` does,
        so an empty file yields ``[]`` rather than ``['']``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Runs of more than two newlines would otherwise produce empty entries.
    return [paragraph for paragraph in text.split('\n\n') if paragraph.strip()]
def process_documents(file_path):
    """Dispatch a file to the matching extractor based on its extension.

    The extension is compared case-insensitively. ``.pdf`` goes to
    ``process_pdf``, ``.docx`` to ``process_docx`` and ``.txt`` to
    ``process_txt``.

    Args:
        file_path: Path of the file to process.

    Returns:
        Whatever the selected extractor returns, or an empty list when the
        extension is not one of the three handled above.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        return process_pdf(file_path)
    if suffix == '.docx':
        return process_docx(file_path)
    if suffix == '.txt':
        return process_txt(file_path)

    # Unsupported or missing extension: nothing to extract.
    return []