Semantic-search-cache / src /playground_docs.py
YENUGU SUJITH REDDY
Initial commit with Git LFS models
45fe8b6
import os
import pickle
import sys
# Root directory of the raw corpus; every file beneath it is treated as one
# document.
data_path = "data/20_newsgroups"


def load_documents(root_dir):
    """Return the text of every readable file found under *root_dir*.

    The tree is traversed with ``os.walk`` and files are returned in the
    order the walk yields them. Each file is decoded as latin1, which maps
    every byte to a code point, so decoding itself cannot fail.

    Args:
        root_dir: Directory to scan recursively.

    Returns:
        A list of strings, one per file that could be read. The list is
        empty when *root_dir* does not exist or contains no files
        (``os.walk`` yields nothing in that case).
    """
    loaded = []
    for root, _dirs, files in os.walk(root_dir):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, "r", encoding="latin1") as f:
                    loaded.append(f.read())
            except OSError as exc:
                # Best-effort load: skip files that cannot be opened or read,
                # but say so, and let KeyboardInterrupt/SystemExit propagate
                # instead of being swallowed by a bare ``except``.
                print("Skipping unreadable file:", file_path, exc,
                      file=sys.stderr)
    return loaded


documents = load_documents(data_path)
if not documents:
    # An empty corpus almost always means a wrong working directory or a
    # missing dataset; the pickle is still written, but not silently.
    print("Warning: no documents found under", data_path, file=sys.stderr)
print("Total documents loaded:", len(documents))

# Persist the corpus as a pickled list of strings for later pipeline stages.
os.makedirs("models", exist_ok=True)
with open("models/documents.pkl", "wb") as f:
    pickle.dump(documents, f)
print("documents.pkl saved successfully!")