Sathvik-kota's picture
Upload folder using huggingface_hub
d0abef8 verified
# src/doc_service/utils.py
import os
import hashlib
import re
def load_text_files(folder_path: str):
    """Load every ``.txt`` file found directly inside *folder_path*.

    Entries are visited in sorted filename order. Only names ending in
    ``.txt`` (case-sensitive) are considered, and entries that are not
    regular files -- for example a sub-directory that happens to be named
    ``something.txt`` or a dangling symlink -- are skipped instead of
    aborting the whole load.

    Files are decoded as UTF-8; undecodable bytes are silently dropped
    (``errors="ignore"``).

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of dicts, one per file, with the keys ``"filename"``,
        ``"path"`` and ``"text"``.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be read.
    """
    docs = []
    for fname in sorted(os.listdir(folder_path)):
        if not fname.endswith(".txt"):
            continue
        full_path = os.path.join(folder_path, fname)
        # os.listdir() also returns directories; open() would raise on them.
        if not os.path.isfile(full_path):
            continue
        with open(full_path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
        docs.append({
            "filename": fname,
            "path": full_path,
            "text": text,
        })
    return docs
def load_original_text(folder_path: str, filename: str):
    """Return the full contents of *filename* located in *folder_path*.

    The file is decoded as UTF-8 and bytes that cannot be decoded are
    dropped rather than raising.
    """
    target = os.path.join(folder_path, filename)
    with open(target, "r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents
def clean_text(text: str) -> str:
    """Normalise *text* for comparison and hashing.

    Steps, in order: lowercase the text, remove ``<...>`` spans that sit on
    a single line (the pattern does not cross newlines), then collapse every
    run of whitespace to one space and trim both ends.
    """
    lowered = text.lower()
    without_tags = re.sub(r'<.*?>', '', lowered)
    collapsed = re.sub(r'\s+', ' ', without_tags)
    return collapsed.strip()
def compute_hash(text: str) -> str:
    """Return the hexadecimal MD5 digest of *text* encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
def preprocess_documents(folder_path: str):
    """Load, clean and fingerprint every text document in *folder_path*.

    Returns a list with one dict per document loaded by
    ``load_text_files``, holding:

    * ``"filename"``      -- the file name as loaded
    * ``"clean_text"``    -- the text after ``clean_text``
    * ``"hash"``          -- ``compute_hash`` of the cleaned text
    * ``"length"``        -- number of whitespace-separated tokens in the
      cleaned text
    * ``"original_text"`` -- the text exactly as loaded
    """
    def _build_record(entry):
        # Everything derived (hash, length) is based on the cleaned text.
        body = entry["text"]
        normalized = clean_text(body)
        digest = compute_hash(normalized)
        return {
            "filename": entry["filename"],
            "clean_text": normalized,
            "hash": digest,
            "length": len(normalized.split()),
            "original_text": body,
        }

    return [_build_record(entry) for entry in load_text_files(folder_path)]