Spaces:
Sleeping
Sleeping
| import os | |
| import sqlite3 | |
| from docx import Document | |
| import re | |
| from hazm import Normalizer | |
| import pypdf | |
| from nltk.tokenize import sent_tokenize | |
| from hazm import SentenceTokenizer # For Persian sentence tokenization | |
def smart_chunking(text, max_tokens=1024, tokenizer=None):
    """
    Split text into chunks along sentence boundaries.

    Each chunk is kept at or below ``max_tokens`` tokens where possible;
    a single sentence longer than ``max_tokens`` becomes its own chunk
    (it is never split mid-sentence). Supports Persian and English text.

    Parameters
    ----------
    text : str
        The text to chunk.
    max_tokens : int, optional
        Soft upper bound on tokens per chunk (default 1024).
    tokenizer : object, optional
        An object with an ``encode(str) -> list`` method used to count
        tokens. When None, tokens are estimated by whitespace splitting.

    Returns
    -------
    list[str]
        The list of chunks, sentences joined by single spaces.
    """
    # Heuristic language detection: any Persian letter selects the hazm
    # sentence tokenizer; otherwise fall back to NLTK's sent_tokenize.
    if any(ch in text for ch in "ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"):
        sentences = SentenceTokenizer().tokenize(text)
    else:
        sentences = sent_tokenize(text)

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Estimate the sentence length in tokens.
        sentence_tokens = tokenizer.encode(sentence) if tokenizer else sentence.split()
        sentence_length = len(sentence_tokens)
        # Flush the current chunk when the next sentence would overflow it.
        # The `current_chunk` guard fixes a bug in the original code, which
        # appended an empty-string chunk whenever a sentence exceeding
        # max_tokens arrived while the current chunk was still empty.
        if current_chunk and current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    # Emit whatever remains as the final chunk.
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def is_meaningful(text):
    """
    Report whether text is free of the ASCII 'End of Medium' control character.

    PDF extraction occasionally produces garbled output containing ``'\\x19'``;
    its presence is used here as a cheap signal that the text is unusable.

    Parameters
    ----------
    text : str
        The text to inspect.

    Returns
    -------
    int
        1 when ``'\\x19'`` is absent (meaningful), 0 when present.

    Example
    -------
    >>> is_meaningful("This is a valid sentence.")
    1
    >>> is_meaningful("Invalid text \\x19 with control character.")
    0
    """
    return 0 if "\x19" in text else 1
| # Step 1: Text Cleaning | |
def clean_text(text):
    """
    Clean raw text by stripping URL-like tokens and collapsing whitespace.

    Steps:
    1. Remove URL-like tokens (anything starting with ``http``, ``https``,
       or ``www`` up to the next whitespace). NOTE: e-mail addresses are
       NOT removed — the pattern only matches URL prefixes. The original
       docstring incorrectly claimed e-mail removal.
    2. Collapse runs of whitespace into a single space.
    3. Strip leading and trailing whitespace.

    A stricter filter that keeps only Persian characters (range
    ``\\u0600-\\u06FF``) and spaces is intentionally left disabled below;
    re-enable it to drop Latin text and punctuation.

    Parameters
    ----------
    text : str
        The input text to be cleaned.

    Returns
    -------
    str
        The cleaned text with URLs removed and whitespace normalized.

    Example
    -------
    >>> clean_text("  متون   با فاصله های   زیاد ")
    'متون با فاصله های زیاد'
    """
    # Remove URL-like tokens.
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"\s+", " ", text)  # Replace multiple spaces with a single space
    # text = re.sub(r"[^\u0600-\u06FF\s]", "", text)  # Optional: keep only Persian characters and spaces
    return text.strip()
| # Step 2: Normalization | |
def normalize_text(text):
    """
    Normalize Persian text using hazm's ``Normalizer``.

    Standardizes characters (e.g. Arabic variants to their Persian
    equivalents) and applies hazm's built-in rules for spacing and
    diacritics.

    Parameters
    ----------
    text : str
        The Persian text to normalize.

    Returns
    -------
    str
        The normalized text with standardized characters and consistent
        formatting.

    Example
    -------
    >>> normalize_text("سلامٔ دوست عزیز، حال شما چطور است؟")
    'سلام دوست عزیز، حال شما چطور است؟'
    """
    return Normalizer().normalize(text)
| # Full Preprocessing Pipeline | |
def preprocess_persian_text(text):
    """
    Run the full Persian preprocessing pipeline: clean, then normalize.

    Combines two stages:
    1. ``clean_text`` — removes URL-like tokens and collapses whitespace.
    2. ``normalize_text`` — standardizes Persian characters and spacing
       via hazm.

    Parameters
    ----------
    text : str
        The Persian text to preprocess.

    Returns
    -------
    str
        The cleaned and normalized text.
    """
    return normalize_text(clean_text(text))
def read_file(file_path):
    """
    Read and preprocess Persian text from a ``.docx``, ``.txt``, or ``.pdf`` file.

    Supported formats:
    1. ``.docx`` — joins the text of all paragraphs in the document.
    2. ``.txt``  — reads the file as UTF-8 plain text.
    3. ``.pdf``  — concatenates the extracted text of every page via pypdf.

    The extracted text is then passed through ``preprocess_persian_text``
    (cleaning + normalization) before being returned.

    Parameters
    ----------
    file_path : str
        Path to the input file. The extension determines the reader.

    Returns
    -------
    str
        The preprocessed Persian text extracted from the file.

    Raises
    ------
    ValueError
        - If the file extension is not ``.docx``, ``.txt``, or ``.pdf``.
        - If the text extracted from a PDF is not meaningful (contains
          the ``'\\x19'`` control character, indicating garbled extraction).
    """
    if file_path.endswith('.docx'):
        doc = Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs)
        return preprocess_persian_text(text)
    elif file_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        return preprocess_persian_text(text)
    elif file_path.endswith('.pdf'):
        reader = pypdf.PdfReader(file_path)
        # extract_text() may return None for pages with no extractable
        # text; `or ""` prevents a TypeError during concatenation.
        raw_data = "".join(page.extract_text() or "" for page in reader.pages)
        if not is_meaningful(raw_data):
            # Fixed: the original raised "Unsupported file format." here,
            # misreporting a garbled-extraction problem as a format problem.
            raise ValueError("Extracted PDF text is not meaningful.")
        return preprocess_persian_text(raw_data)
    else:
        # Fixed: the original message omitted .pdf even though it is supported.
        raise ValueError("Unsupported file format. Only .docx, .txt, and .pdf are allowed.")