Navya-Sree committed on
Commit
ed60c1b
·
verified ·
1 Parent(s): 166441f

Create utils/rag_system.py

Browse files
Files changed (1) hide show
  1. utils/rag_system.py +124 -0
utils/rag_system.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List, Dict
3
+ import chromadb
4
+ from chromadb.utils import embedding_functions
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain.document_loaders import TextLoader
7
+
8
class RAGSystem:
    """
    Retrieval-Augmented Generation system for providing documentation context.

    Wraps a persistent ChromaDB collection whose documents are embedded with a
    SentenceTransformer model. The collection is seeded with a handful of
    built-in Python tips the first time it is created (i.e. while it is empty).
    """

    def __init__(
        self,
        collection_name="python_docs",
        persist_path="./chroma_db",
        model_name="all-MiniLM-L6-v2",
    ):
        """
        Open (or create) the persistent collection and seed it when empty.

        Args:
            collection_name: Name of the ChromaDB collection to use.
            persist_path: Directory where ChromaDB stores its data.
            model_name: SentenceTransformer model used to embed documents
                and queries.
        """
        self.client = chromadb.PersistentClient(path=persist_path)

        # Use sentence transformers for embeddings
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=model_name
        )

        # Get or create collection
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_function,
        )

        # Load default documents if collection is empty
        if self.collection.count() == 0:
            self._load_default_documents()

    def _load_default_documents(self):
        """Load the built-in Python documentation snippets into the collection."""
        default_docs = [
            {
                "id": "1",
                "text": "Python functions are defined using the def keyword. Example: def hello(): return 'Hello'",
                "metadata": {"source": "python_basics"},
            },
            {
                "id": "2",
                "text": "Use type hints for better code documentation. Example: def add(a: int, b: int) -> int:",
                "metadata": {"source": "best_practices"},
            },
            {
                "id": "3",
                "text": "Always handle exceptions with try-except blocks to prevent crashes.",
                "metadata": {"source": "error_handling"},
            },
            {
                "id": "4",
                "text": "Use list comprehensions for concise list creation: [x*2 for x in range(10)]",
                "metadata": {"source": "python_tips"},
            },
            {
                "id": "5",
                "text": "Document your code with docstrings. Use triple quotes for multi-line documentation.",
                "metadata": {"source": "documentation"},
            },
        ]

        # Add documents to collection in a single batched call
        self.collection.add(
            documents=[doc["text"] for doc in default_docs],
            metadatas=[doc["metadata"] for doc in default_docs],
            ids=[doc["id"] for doc in default_docs],
        )

    def add_document(self, text: str, source: str = "user") -> str:
        """
        Add a new document to the knowledge base.

        Args:
            text: Document text to embed and store.
            source: Label stored in the document's ``source`` metadata field.

        Returns:
            The generated ID under which the document was stored.
        """
        # Local import keeps this method self-contained without touching the
        # module's import block.
        import uuid

        # A count-based ID ("doc_<count + 1>") repeats an existing ID as soon
        # as any document has been deleted; a random UUID cannot collide.
        doc_id = f"doc_{uuid.uuid4().hex}"
        self.collection.add(
            documents=[text],
            metadatas=[{"source": source}],
            ids=[doc_id],
        )
        return doc_id

    def search(self, query: str, n_results: int = 3) -> List[Dict]:
        """
        Search for relevant documents.

        Args:
            query: Search query
            n_results: Maximum number of results to return

        Returns:
            List of dicts with ``text``, ``metadata`` and ``distance`` keys,
            in the order returned by the collection. Empty when the collection
            holds no documents or ``n_results`` is not positive.
        """
        # Never ask for more neighbours than are stored: depending on the
        # ChromaDB version this either raises or logs a warning.
        limit = min(n_results, self.collection.count())
        if limit <= 0:
            return []

        results = self.collection.query(
            query_texts=[query],
            n_results=limit,
        )

        # Each field is a list with one entry per query text; only one query
        # is sent, so take index 0. Any field may be absent or None.
        texts = (results.get("documents") or [[]])[0] or []
        metadatas = (results.get("metadatas") or [[]])[0] or []
        distances = (results.get("distances") or [[]])[0] or []

        return [
            {
                "text": text,
                "metadata": metadatas[i] if i < len(metadatas) else None,
                "distance": distances[i] if i < len(distances) else None,
            }
            for i, text in enumerate(texts)
        ]

    def get_context(self, query: str, max_docs: int = 2) -> str:
        """
        Get relevant context for a coding query.

        Args:
            query: Coding task or question
            max_docs: Maximum number of documents to include in the context

        Returns:
            Context string from relevant documents, or an empty string when
            nothing relevant is found.
        """
        # Only request as many documents as will actually be used.
        relevant_docs = self.search(query, n_results=max_docs)

        if not relevant_docs:
            return ""

        # Combine top documents into a numbered context block
        context_parts = ["Relevant documentation:"]
        context_parts.extend(
            f"{position}. {doc['text']}"
            for position, doc in enumerate(relevant_docs[:max_docs], start=1)
        )

        return "\n".join(context_parts)