from data_loader import load_documents
from preprocess import clean_text
from embedder import embed_documents


# Pipeline script: load the corpus, clean every document, then embed a
# leading slice of the cleaned documents and report progress on stdout.

# Path handed to load_documents as the corpus location.
dataset_path = "data/20_newsgroups"

# The loader's result is unpacked into two names; `labels` is not used in
# the statements visible here.
docs, labels = load_documents(dataset_path)
print("Loaded:", len(docs))

# Run clean_text over each document and rebind `docs` to the cleaned list.
docs = list(map(clean_text, docs))
print("Preprocessing done")

# Only the first 100 cleaned documents (fewer if the corpus is smaller)
# are passed to the embedder; the result is expected to expose `.shape`.
embeddings = embed_documents(docs[:100])
print("Embedding shape:", embeddings.shape)