File size: 2,405 Bytes
8098153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

import json
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
import os

DATA_PATH = "data/final_cleaned_student_data.json"
DB_PATH = "vector_store"

def create_vector_db():
    """Reads the cleaned student data and builds a FAISS vector store."""
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        student_data = json.load(f)

    print("Preparing documents for vectorization...")
    documents = []
    for enroll_no, data in student_data.items():
        name = data.get("name")
        
        # Create a document for each major section of the student's profile
        if data.get("academic_profile"):
            documents.append(Document(
                page_content=json.dumps(data["academic_profile"]),
                metadata={"enrollment_no": enroll_no, "student_name": name, "data_source": "academics"}
            ))
        
        if data.get("coding_profiles", {}).get("leetcode"):
            documents.append(Document(
                page_content=json.dumps(data["coding_profiles"]["leetcode"]),
                metadata={"enrollment_no": enroll_no, "student_name": name, "data_source": "leetcode"}
            ))

        if data.get("coding_profiles", {}).get("github"):
            documents.append(Document(
                page_content=json.dumps(data["coding_profiles"]["github"]),
                metadata={"enrollment_no": enroll_no, "student_name": name, "data_source": "github"}
            ))

        if data.get("coding_profiles", {}).get("codeforces"):
            documents.append(Document(
                page_content=json.dumps(data["coding_profiles"]["codeforces"]),
                metadata={"enrollment_no": enroll_no, "student_name": name, "data_source": "codeforces"}
            ))

    print(f"Created {len(documents)} documents.")

    # Use a standard, high-performance embedding model
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

    print("Creating FAISS vector store...")
    db = FAISS.from_documents(documents, embeddings)
    
    # Save the vector store locally
    if not os.path.exists(DB_PATH):
        os.makedirs(DB_PATH)
    db.save_local(DB_PATH)
    
    print(f"✅ Vector store created and saved at '{DB_PATH}'")

if __name__ == "__main__":
    create_vector_db()