Spaces:
Sleeping
Sleeping
| import json | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.docstore.document import Document | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| import os | |
| DATA_PATH = "data/final_cleaned_student_data.json" | |
| DB_PATH = "vector_store" | |
| def create_vector_db(): | |
| """Reads the cleaned student data and builds a FAISS vector store.""" | |
| with open(DATA_PATH, 'r', encoding='utf-8') as f: | |
| student_data = json.load(f) | |
| print("Preparing documents for vectorization...") | |
| documents = [] | |
| for enroll_no, data in student_data.items(): | |
| name = data.get("name") | |
| # Create a document for each major section of the student's profile | |
| if data.get("academic_profile"): | |
| documents.append(Document( | |
| page_content=json.dumps(data["academic_profile"]), | |
| metadata={"enrollment_no": enroll_no, "student_name": name, "data_source": "academics"} | |
| )) | |
| if data.get("coding_profiles", {}).get("leetcode"): | |
| documents.append(Document( | |
| page_content=json.dumps(data["coding_profiles"]["leetcode"]), | |
| metadata={"enrollment_no": enroll_no, "student_name": name, "data_source": "leetcode"} | |
| )) | |
| if data.get("coding_profiles", {}).get("github"): | |
| documents.append(Document( | |
| page_content=json.dumps(data["coding_profiles"]["github"]), | |
| metadata={"enrollment_no": enroll_no, "student_name": name, "data_source": "github"} | |
| )) | |
| if data.get("coding_profiles", {}).get("codeforces"): | |
| documents.append(Document( | |
| page_content=json.dumps(data["coding_profiles"]["codeforces"]), | |
| metadata={"enrollment_no": enroll_no, "student_name": name, "data_source": "codeforces"} | |
| )) | |
| print(f"Created {len(documents)} documents.") | |
| # Use a standard, high-performance embedding model | |
| embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2') | |
| print("Creating FAISS vector store...") | |
| db = FAISS.from_documents(documents, embeddings) | |
| # Save the vector store locally | |
| if not os.path.exists(DB_PATH): | |
| os.makedirs(DB_PATH) | |
| db.save_local(DB_PATH) | |
| print(f"✅ Vector store created and saved at '{DB_PATH}'") | |
| if __name__ == "__main__": | |
| create_vector_db() |