# Spaces: Build error
# File size: 4,282 Bytes
# ed60c1b
import os
import uuid
from typing import List, Dict

import chromadb
from chromadb.utils import embedding_functions
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
class RAGSystem:
    """
    Retrieval-Augmented Generation system for providing documentation context.

    Wraps a persistent Chroma collection: documents are added with
    ``add_document``, retrieved with ``search``, and rendered into a prompt
    snippet with ``get_context``.
    """

    # Seed documents inserted when the collection is created empty.
    _DEFAULT_DOCS = (
        {
            "id": "1",
            "text": "Python functions are defined using the def keyword. Example: def hello(): return 'Hello'",
            "metadata": {"source": "python_basics"},
        },
        {
            "id": "2",
            "text": "Use type hints for better code documentation. Example: def add(a: int, b: int) -> int:",
            "metadata": {"source": "best_practices"},
        },
        {
            "id": "3",
            "text": "Always handle exceptions with try-except blocks to prevent crashes.",
            "metadata": {"source": "error_handling"},
        },
        {
            "id": "4",
            "text": "Use list comprehensions for concise list creation: [x*2 for x in range(10)]",
            "metadata": {"source": "python_tips"},
        },
        {
            "id": "5",
            "text": "Document your code with docstrings. Use triple quotes for multi-line documentation.",
            "metadata": {"source": "documentation"},
        },
    )

    def __init__(
        self,
        collection_name="python_docs",
        persist_directory="./chroma_db",
        embedding_model="all-MiniLM-L6-v2",
    ):
        """
        Open (or create) the collection and seed it if it is empty.

        Args:
            collection_name: Name of the Chroma collection to use
            persist_directory: Path handed to ``chromadb.PersistentClient``
            embedding_model: Model name handed to the sentence-transformer
                embedding function
        """
        self.client = chromadb.PersistentClient(path=persist_directory)

        # Use sentence transformers for embeddings
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=embedding_model
        )

        # Get or create collection
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_function,
        )

        # Load default documents if collection is empty
        if self.collection.count() == 0:
            self._load_default_documents()

    def _load_default_documents(self):
        """Load default Python documentation."""
        self.collection.add(
            documents=[doc["text"] for doc in self._DEFAULT_DOCS],
            metadatas=[doc["metadata"] for doc in self._DEFAULT_DOCS],
            ids=[doc["id"] for doc in self._DEFAULT_DOCS],
        )

    def add_document(self, text: str, source: str = "user") -> str:
        """
        Add a new document to the knowledge base.

        Args:
            text: Document text to store
            source: Label stored in the document's ``source`` metadata
        Returns:
            The id under which the document was stored
        """
        # A count-based id ("doc_<count + 1>") repeats as soon as any document
        # has been removed, risking a clash with an existing id. A random id
        # does not depend on the current collection size.
        doc_id = f"doc_{uuid.uuid4().hex}"
        self.collection.add(
            documents=[text],
            metadatas=[{"source": source}],
            ids=[doc_id],
        )
        return doc_id

    def search(self, query: str, n_results: int = 3) -> List[Dict]:
        """
        Search for relevant documents.

        Args:
            query: Search query
            n_results: Maximum number of results to return
        Returns:
            List of dicts with ``text``, ``metadata`` and ``distance`` keys,
            in the order returned by the collection. Empty when the
            collection holds no documents or ``n_results`` is below 1.
        """
        # Never ask the index for more entries than it holds.
        # NOTE(review): Chroma releases appear to differ here (raise vs. warn
        # and shrink) - confirm against the pinned version.
        limit = min(n_results, self.collection.count())
        if limit < 1:
            return []

        results = self.collection.query(
            query_texts=[query],
            n_results=limit,
        )

        batches = results.get("documents")
        if not batches:
            return []

        # One query text was sent, so only the first batch is relevant.
        texts = batches[0]
        placeholder = [[None] * len(texts)]
        metadatas = (results.get("metadatas") or placeholder)[0]
        distances = (results.get("distances") or placeholder)[0]

        return [
            {"text": text, "metadata": metadata, "distance": distance}
            for text, metadata, distance in zip(texts, metadatas, distances)
        ]

    def get_context(self, query: str, max_docs: int = 2) -> str:
        """
        Get relevant context for a coding query.

        Args:
            query: Coding task or question
            max_docs: Maximum number of documents to include in the context
        Returns:
            Context string from relevant documents, or an empty string when
            nothing was found
        """
        # Only fetch as many documents as will actually be rendered.
        relevant_docs = self.search(query, n_results=max_docs)
        if not relevant_docs:
            return ""

        # Combine top documents into a numbered context block
        context_parts = ["Relevant documentation:"]
        context_parts.extend(
            f"{rank}. {doc['text']}"
            for rank, doc in enumerate(relevant_docs[:max_docs], start=1)
        )
        return "\n".join(context_parts)