File size: 5,701 Bytes
06640a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import json
import chromadb
import os
import requests
from chromadb.utils import embedding_functions
from groq import Groq

class PortfolioRAG:
    """Retrieval-augmented question answering over a portfolio metadata file.

    On construction the JSON file at ``data_path`` is loaded, each entry is
    rendered to a text document and upserted into a persistent ChromaDB
    collection. ``chat`` then retrieves the closest documents for a question
    and asks a Groq-hosted chat model to answer from that context.
    """

    # Name of the ChromaDB collection holding the portfolio documents.
    COLLECTION_NAME = "meta_portfolio"
    # Model identifier and sampling temperature sent to the Groq chat API.
    GROQ_MODEL = "llama3-8b-8192"
    GROQ_TEMPERATURE = 0.3

    def __init__(self, data_path="data/metadata.json", db_path="chroma_db", model_name="gpt-oss:20b-cloud"):
        """Create the Groq client (if a key is configured) and build the index.

        Args:
            data_path: Path to the JSON file describing projects/articles.
            db_path: Directory used by the persistent ChromaDB client.
            model_name: Accepted for backward compatibility and stored on the
                instance; the Groq request uses ``GROQ_MODEL`` instead.
        """
        self.data_path = data_path
        self.db_path = db_path
        # Previously this argument was dropped; keep it so callers can read it.
        self.model_name = model_name
        self.groq_api_key = os.environ.get("GROQ_API_KEY")
        # Always define the attribute so later code can test it for None
        # instead of hitting AttributeError when the key is missing.
        self.groq_client = None
        if self.groq_api_key:
            self.groq_client = Groq(api_key=self.groq_api_key)
        else:
            print("WARNING: GROQ_API_KEY environment variable not set!")
        self.collection = None
        self._init_knowledge_base()

    @staticmethod
    def _join_values(value):
        """Render a list-like or scalar field as a comma-separated string."""
        if value is None:
            return ""
        if isinstance(value, str):
            # A bare string must not be joined character by character.
            return value
        if isinstance(value, (list, tuple, set)):
            return ", ".join(str(v) for v in value)
        return str(value)

    @classmethod
    def _build_document(cls, item, index):
        """Build the (document text, metadata dict) pair for one JSON entry."""
        project_name = item.get("project", f"Project_{index}")
        category = item.get("category", "Unknown")
        domain = item.get("domain", "Unknown Domain")
        link = item.get("link", "No link provided")
        achievement = item.get("key_achievement", "")
        challenge = item.get("challenge", "")

        category_key = str(category).lower()
        extra_info = ""
        if category_key == "project" or "language" in item:
            language = cls._join_values(item.get("language"))
            tech_stack = cls._join_values(item.get("tech_stack"))
            extra_info = (
                f"Languages: {language}\n"
                f"Tech Stack: {tech_stack}"
            )
        elif category_key == "article" or "abstract" in item:
            abstract = item.get("abstract", "")
            extra_info = f"Abstract: {abstract}"

        doc_text = (
            f"Project Name: {project_name}\n"
            f"Domain: {domain}\n"
            f"Category: {category}\n"
            f"Link: {link}\n"
            f"{extra_info}\n"
            f"Key Achievement: {achievement}\n"
            f"Technical Challenge: {challenge}"
        )

        meta_dict = {
            "project": project_name,
            "domain": domain,
            "category": category,
            "link": link,
            "key_achievement": achievement,
            "challenge": challenge,
        }
        # Optional fields are only stored when they carry a value.
        if item.get("language"):
            meta_dict["language"] = cls._join_values(item["language"])
        if item.get("tech_stack"):
            meta_dict["tech_stack"] = cls._join_values(item["tech_stack"])
        if item.get("abstract"):
            meta_dict["abstract"] = item["abstract"]

        return doc_text, meta_dict

    def _init_knowledge_base(self):
        """Open the ChromaDB collection and upsert every entry of the data file.

        If the data file does not exist, an error is printed and the
        collection is left as it was on disk. A single JSON object is treated
        as a one-element list. Entries that are not JSON objects are skipped.
        """
        print("Initializing Local Vector Database (ChromaDB)...")
        client = chromadb.PersistentClient(path=self.db_path)
        default_ef = embedding_functions.DefaultEmbeddingFunction()

        self.collection = client.get_or_create_collection(
            name=self.COLLECTION_NAME,
            embedding_function=default_ef
        )

        if not os.path.exists(self.data_path):
            print(f"Error: {self.data_path} not found.")
            return

        with open(self.data_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        if isinstance(data, dict):
            data = [data]

        documents = []
        metadatas = []
        ids = []

        for i, item in enumerate(data):
            if not isinstance(item, dict):
                print(f"Warning: skipping entry {i}: expected an object, got {type(item).__name__}.")
                continue
            doc_text, meta_dict = self._build_document(item, i)
            documents.append(doc_text)
            metadatas.append(meta_dict)
            # The id stays tied to the position in the file so re-runs
            # overwrite the same records.
            ids.append(f"doc_{i}")

        # Skip the call for an empty data file rather than upserting nothing.
        if documents:
            self.collection.upsert(documents=documents, metadatas=metadatas, ids=ids)
        print(f"Knowledge Base ready! ({len(documents)} items stored)\n")

    def retrieve_context(self, query, n_results=2):
        """Return the documents closest to ``query`` and their metadata.

        Args:
            query: Free-text question to search for.
            n_results: Maximum number of documents to retrieve.

        Returns:
            A ``(context_text, source_metadata)`` tuple: the matching
            documents joined by blank lines, and the list of their metadata
            dicts. ``("", [])`` when there is no collection or no match.
        """
        if self.collection is None:
            return "", []

        print("Retrieving relevant context from ChromaDB...")
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results
        )
        if not results:
            return "", []

        documents = results.get("documents")
        if not documents or not documents[0]:
            return "", []

        context_text = "\n\n".join(documents[0])
        # "metadatas" may be absent or None; treat both as "no metadata".
        metadatas = results.get("metadatas")
        source_metadata = metadatas[0] if metadatas and metadatas[0] else []

        return context_text, source_metadata

    def generate_response(self, query, context):
        """Ask the Groq chat model to answer ``query`` using only ``context``.

        Args:
            query: The user's question.
            context: Retrieved document text to ground the answer.

        Returns:
            The model's answer text, or a string starting with
            ``"Error communicating with Groq API:"`` if the client is not
            configured or the request fails.
        """
        print(f"Asking Groq ({self.GROQ_MODEL})...")

        if self.groq_client is None:
            return "Error communicating with Groq API: GROQ_API_KEY environment variable not set"

        prompt = f"""You are an AI assistant representing the portfolio of Edmond Song, a Senior DeFi and AI Research Engineer.
        Based ONLY on the following context about Edmond's projects and articles, answer the user's question clearly and professionally.

        Context:
        {context}

        Question:
        {query}

        Answer:"""

        # Broad catch is deliberate: this is the boundary to a remote service
        # and callers expect an error string rather than an exception.
        try:
            chat_completion = self.groq_client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=self.GROQ_MODEL,
                temperature=self.GROQ_TEMPERATURE,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            return f"Error communicating with Groq API: {str(e)}"

    def chat(self, query):
        """Answer ``query`` from the knowledge base.

        Returns:
            A dict with ``"answer"`` (the model reply, or a fallback sentence
            when nothing relevant was retrieved) and ``"metadata"`` (the
            metadata of the documents used, or an empty list).
        """
        context_text, source_metadata = self.retrieve_context(query)
        if context_text:
            ai_response = self.generate_response(query, context_text)
            return {
                "answer": ai_response,
                "metadata": source_metadata
            }

        return {
            "answer": "I don't have enough context to answer that.",
            "metadata": []
        }