| import os | |
| import re | |
| from docx import Document | |
| from PyPDF2 import PdfReader | |
def read_txt_file(file_path):
    """Return the full contents of a UTF-8 encoded text file.

    The file is decoded with ``utf-8-sig`` so that a leading byte-order mark
    (U+FEFF), if present, is dropped instead of being returned as the first
    character of the text. Files without a BOM decode exactly as they would
    with plain ``utf-8``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded contents of the file as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8-sig") as f:
        return f.read()
def read_docx_file(file_path):
    """Return the paragraph text of a .docx file as one string.

    The document is opened with ``docx.Document`` and the ``text`` of every
    entry in its ``paragraphs`` collection is joined with newline characters.
    Only ``paragraphs`` is read here; nothing else from the document is
    included in the result.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The paragraph texts separated by ``"\\n"``.
    """
    document = Document(file_path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    joined = "\n".join(paragraph_texts)
    return joined
def read_pdf_file(file_path):
    """Return the extracted text of every page of a PDF as one string.

    Each page of the ``PdfReader`` is asked for its text; a page whose
    extraction yields a falsy value (such as ``None`` or an empty string)
    contributes an empty string. Page texts are joined with newlines.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The per-page texts separated by ``"\\n"``.
    """
    pdf = PdfReader(file_path)
    page_texts = []
    for page in pdf.pages:
        extracted = page.extract_text()
        # Keep one entry per page even when nothing could be extracted.
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)
def split_arabic_text(text, chunk_size=500):
    """Split text into chunks of whole sentences of roughly ``chunk_size`` chars.

    Sentences are delimited by whitespace that follows ``.``, ``؟`` (Arabic
    question mark) or ``!``. Consecutive sentences are packed into a chunk
    for as long as the buffered text (each sentence counted with one
    separating space) plus the next sentence stays within ``chunk_size``.

    A single sentence longer than ``chunk_size`` is never cut; it is returned
    as its own oversized chunk. Empty chunks are never produced, so empty or
    whitespace-only input yields an empty list.

    Args:
        text: The text to split.
        chunk_size: Target maximum number of characters per chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings in their
        original order.
    """
    sentences = re.split(r'(?<=[.؟!])\s+', text)
    chunks = []
    pending = []      # sentences collected for the chunk being built
    pending_len = 0   # length of the pending sentences, one space after each

    for sentence in sentences:
        # Flush only when something is buffered: an oversized first sentence
        # must not cause an empty chunk to be emitted ahead of it.
        if pending and pending_len + len(sentence) > chunk_size:
            chunk = " ".join(pending).strip()
            if chunk:
                chunks.append(chunk)
            pending = []
            pending_len = 0
        pending.append(sentence)
        pending_len += len(sentence) + 1

    # Emit whatever is left, skipping a buffer that is only whitespace.
    tail = " ".join(pending).strip()
    if tail:
        chunks.append(tail)
    return chunks
def process_documents(file_path):
    """Read a .txt, .docx or .pdf file and return its text split into chunks.

    The reader is chosen from the lower-cased file extension. Line breaks
    (``\\n`` and ``\\r``) in the extracted text are replaced by spaces and the
    result is stripped before being handed to ``split_arabic_text`` with its
    default chunk size.

    Args:
        file_path: Path of the document to process.

    Returns:
        The list of chunks produced by ``split_arabic_text``, or an empty
        list when the file extension is not one of the supported ones.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Unsupported file types yield no chunks rather than raising.
    if suffix not in (".txt", ".docx", ".pdf"):
        return []

    if suffix == ".txt":
        raw_text = read_txt_file(file_path)
    elif suffix == ".docx":
        raw_text = read_docx_file(file_path)
    else:
        raw_text = read_pdf_file(file_path)

    # Map each newline / carriage return to a single space in one pass.
    flattened = raw_text.translate(str.maketrans("\n\r", "  "))
    return split_arabic_text(flattened.strip())