Spaces:
Sleeping
Sleeping
| # src/doc_service/utils.py | |
| import os | |
| import hashlib | |
| import re | |
def load_text_files(folder_path: str):
    """Load every ``.txt`` file located directly inside *folder_path*.

    Entries are visited in sorted filename order. An entry whose name ends
    in ``.txt`` but that is not a regular file (for example a sub-directory
    named ``notes.txt``) is skipped instead of aborting the whole load.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A list of dicts, one per file, with the keys ``"filename"``,
        ``"path"`` and ``"text"``.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
    """
    docs = []
    for fname in sorted(os.listdir(folder_path)):
        if not fname.endswith(".txt"):
            continue
        full_path = os.path.join(folder_path, fname)
        # os.listdir() also yields directories; open() on one would raise.
        if not os.path.isfile(full_path):
            continue
        # Undecodable bytes are dropped rather than raising (best-effort read).
        with open(full_path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
        docs.append({
            "filename": fname,
            "path": full_path,
            "text": text,
        })
    return docs
def load_original_text(folder_path: str, filename: str):
    """Return the full text of *filename* located inside *folder_path*.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    rather than raising.
    """
    target = os.path.join(folder_path, filename)
    with open(target, "r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents
def clean_text(text: str) -> str:
    """Normalize *text* for comparison.

    Steps, in order: lowercase, remove ``<...>`` spans, collapse every run
    of whitespace to a single space, and trim both ends.

    Note: the tag pattern does not match across a line break, so a
    ``<...>`` span containing a newline is left in place.
    """
    without_tags = re.sub(r'<.*?>', '', text.lower())
    single_spaced = re.sub(r'\s+', ' ', without_tags)
    return single_spaced.strip()
def compute_hash(text: str) -> str:
    """Return the hexadecimal MD5 digest of *text* encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
def preprocess_documents(folder_path: str):
    """Load, clean and fingerprint every text document in *folder_path*.

    Returns a list with one dict per loaded document, holding:
    ``"filename"``, ``"clean_text"`` (the output of ``clean_text``),
    ``"hash"`` (``compute_hash`` of the cleaned text), ``"length"`` (number
    of whitespace-separated tokens in the cleaned text) and
    ``"original_text"`` (the text as loaded).
    """
    records = []
    for entry in load_text_files(folder_path):
        original = entry["text"]
        normalized = clean_text(original)
        fingerprint = compute_hash(normalized)
        record = {
            "filename": entry["filename"],
            "clean_text": normalized,
            "hash": fingerprint,
            "length": len(normalized.split()),
            "original_text": original,
        }
        records.append(record)
    return records