"""Pipeline smoke-test script.

Loads documents from ``dataset_path``, runs ``clean_text`` over every
document, embeds the first ``EMBED_SAMPLE_SIZE`` cleaned documents, and
prints a short progress line after each stage.
"""

from data_loader import load_documents
from preprocess import clean_text
from embedder import embed_documents

# Location handed to load_documents(); relative to the working directory.
dataset_path = "data/20_newsgroups"

# Only this many leading documents are passed to embed_documents().
EMBED_SAMPLE_SIZE = 100

# load_documents() returns a pair; `labels` is not used further in this script.
docs, labels = load_documents(dataset_path)
print("Loaded:", len(docs))

# Clean every document, not just the sample that is embedded below.
docs = [clean_text(d) for d in docs]
print("Preprocessing done")

# Slicing is safe even when fewer than EMBED_SAMPLE_SIZE documents were loaded.
embeddings = embed_documents(docs[:EMBED_SAMPLE_SIZE])
# NOTE(review): relies on the result exposing `.shape` (presumably an
# array-like object) — confirm against embed_documents().
print("Embedding shape:", embeddings.shape)