File size: 4,282 Bytes
ed60c1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import uuid
from typing import List, Dict

import chromadb
from chromadb.utils import embedding_functions
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader

class RAGSystem:
    """
    Retrieval-Augmented Generation system for providing documentation context.

    Wraps a persistent ChromaDB collection whose documents are embedded with a
    sentence-transformers model. A small set of built-in Python tips is
    inserted the first time an empty collection is opened.
    """

    # Seed documents inserted by _load_default_documents() when the
    # collection is found empty.
    _DEFAULT_DOCS = (
        {
            "id": "1",
            "text": "Python functions are defined using the def keyword. Example: def hello(): return 'Hello'",
            "metadata": {"source": "python_basics"},
        },
        {
            "id": "2",
            "text": "Use type hints for better code documentation. Example: def add(a: int, b: int) -> int:",
            "metadata": {"source": "best_practices"},
        },
        {
            "id": "3",
            "text": "Always handle exceptions with try-except blocks to prevent crashes.",
            "metadata": {"source": "error_handling"},
        },
        {
            "id": "4",
            "text": "Use list comprehensions for concise list creation: [x*2 for x in range(10)]",
            "metadata": {"source": "python_tips"},
        },
        {
            "id": "5",
            "text": "Document your code with docstrings. Use triple quotes for multi-line documentation.",
            "metadata": {"source": "documentation"},
        },
    )

    def __init__(
        self,
        collection_name: str = "python_docs",
        persist_path: str = "./chroma_db",
        model_name: str = "all-MiniLM-L6-v2",
    ):
        """
        Open (or create) the collection and seed it if it is empty.

        Args:
            collection_name: Name of the ChromaDB collection to use
            persist_path: Directory handed to the persistent ChromaDB client
            model_name: Sentence-transformers model used for embeddings
        """
        self.client = chromadb.PersistentClient(path=persist_path)

        # Use sentence transformers for embeddings
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=model_name
        )

        # Get or create collection
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_function
        )

        # Load default documents if collection is empty
        if self.collection.count() == 0:
            self._load_default_documents()

    def _load_default_documents(self):
        """Load default Python documentation."""
        self.collection.add(
            documents=[doc["text"] for doc in self._DEFAULT_DOCS],
            metadatas=[doc["metadata"] for doc in self._DEFAULT_DOCS],
            ids=[doc["id"] for doc in self._DEFAULT_DOCS]
        )

    def add_document(self, text: str, source: str = "user") -> str:
        """
        Add a new document to the knowledge base.

        Args:
            text: Document body to embed and store
            source: Label stored under the "source" metadata key

        Returns:
            The ID assigned to the stored document
        """
        # A count-based ID ("doc_<count + 1>") repeats an existing ID as soon
        # as a document has been deleted or two writers add at once; a random
        # UUID cannot collide that way.
        doc_id = f"doc_{uuid.uuid4().hex}"
        self.collection.add(
            documents=[text],
            metadatas=[{"source": source}],
            ids=[doc_id]
        )
        return doc_id

    def search(self, query: str, n_results: int = 3) -> List[Dict]:
        """
        Search for relevant documents.

        Args:
            query: Search query
            n_results: Maximum number of results to return

        Returns:
            List of dicts with "text", "metadata" and "distance" keys, in the
            order ChromaDB returns them. Empty when the collection holds no
            documents or n_results is not positive.
        """
        # Never request more hits than there are stored documents, and skip
        # the query entirely when nothing could be returned.
        # NOTE(review): how ChromaDB reacts to an oversized or non-positive
        # n_results appears to vary by version - clamping avoids relying on it.
        limit = min(n_results, self.collection.count())
        if limit <= 0:
            return []

        results = self.collection.query(
            query_texts=[query],
            n_results=limit
        )

        if not results['documents']:
            return []

        # Only one query text is sent, so the first row of each column holds
        # every hit for it.
        return [
            {"text": text, "metadata": metadata, "distance": distance}
            for text, metadata, distance in zip(
                results['documents'][0],
                results['metadatas'][0],
                results['distances'][0],
            )
        ]

    def get_context(self, query: str, max_docs: int = 2) -> str:
        """
        Get relevant context for a coding query.

        Args:
            query: Coding task or question
            max_docs: Maximum number of documents to include in the context

        Returns:
            Context string from relevant documents, or "" when none are found
        """
        # Ask only for as many documents as will actually be rendered.
        relevant_docs = self.search(query, n_results=max_docs)

        if not relevant_docs:
            return ""

        # Combine top documents into a numbered list under a fixed header
        context_parts = ["Relevant documentation:"]
        context_parts.extend(
            f"{rank}. {doc['text']}"
            for rank, doc in enumerate(relevant_docs[:max_docs], start=1)
        )

        return "\n".join(context_parts)