Spaces:
Sleeping
Sleeping
Commit ·
db8ee02
1
Parent(s): 7414af1
checking changes
Browse files
- services/rag_chatbot_service.py +73 -18
services/rag_chatbot_service.py
CHANGED
|
@@ -6,6 +6,7 @@ import logging
|
|
| 6 |
from typing import List, Dict, Any, Optional
|
| 7 |
from datetime import datetime, timezone
|
| 8 |
from pydantic import BaseModel
|
|
|
|
| 9 |
|
| 10 |
from config.settings import settings
|
| 11 |
|
|
@@ -34,9 +35,53 @@ class RAGChatbotService:
|
|
| 34 |
This implementation uses in-memory search as fallback.
|
| 35 |
"""
|
| 36 |
|
|
|
|
|
|
|
|
|
|
| 37 |
def __init__(self):
|
| 38 |
self.use_vector_db = False # Set True when ChromaDB is available
|
| 39 |
self._embeddings_cache = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
async def answer_question(
|
| 42 |
self,
|
|
@@ -74,28 +119,38 @@ class RAGChatbotService:
|
|
| 74 |
|
| 75 |
# If no indexed content, try to fetch README directly from GitHub
|
| 76 |
if not has_indexed_content:
|
| 77 |
-
logger.info(f"No indexed content for {repo_name}, fetching README
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
url = f"https://raw.githubusercontent.com/{owner}/{repo}/master/README.md"
|
| 91 |
response = await client.get(url)
|
| 92 |
if response.status_code == 200:
|
| 93 |
readme_content = response.text
|
| 94 |
-
|
|
|
|
|
|
|
| 95 |
else:
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
# Search for relevant documents (from indexed chunks)
|
| 101 |
relevant_docs = await self.search_documents(question, repo_name, top_k)
|
|
|
|
| 6 |
from typing import List, Dict, Any, Optional
|
| 7 |
from datetime import datetime, timezone
|
| 8 |
from pydantic import BaseModel
|
| 9 |
+
import time
|
| 10 |
|
| 11 |
from config.settings import settings
|
| 12 |
|
|
|
|
| 35 |
This implementation uses in-memory search as fallback.
|
| 36 |
"""
|
| 37 |
|
| 38 |
+
# Cache settings
|
| 39 |
+
README_CACHE_TTL = 600 # 10 minutes in seconds
|
| 40 |
+
|
| 41 |
def __init__(self):
    """
    Set up the service's in-memory state.

    The vector-database flag starts out disabled and both caches start
    out empty.
    """
    # Set True when ChromaDB is available
    self.use_vector_db = False
    self._embeddings_cache = {}
    # {repo_key: {"content": str, "timestamp": float}}
    self._readme_cache = {}
|
| 45 |
+
|
| 46 |
+
def _get_cached_readme(self, repo_name: str) -> Optional[str]:
    """
    Look up the README cached for a repository, honouring the TTL.

    An entry older than README_CACHE_TTL seconds is evicted on access and
    reported as a miss.

    Args:
        repo_name: Repository name (owner/repo format)

    Returns:
        The cached README text, or None when nothing is cached for the
        repository or the cached entry has expired
    """
    try:
        entry = self._readme_cache[repo_name]
    except KeyError:
        # Nothing has been cached for this repository
        return None

    age = time.time() - entry["timestamp"]
    ttl = self.README_CACHE_TTL

    if age > ttl:
        # Stale entry: drop it and report a miss
        del self._readme_cache[repo_name]
        logger.info(f"README cache expired for {repo_name} (age: {age:.1f}s)")
        return None

    logger.info(f"✅ Serving README from cache for {repo_name} (age: {age:.1f}s)")
    return entry["content"]
|
| 70 |
+
|
| 71 |
+
def _cache_readme(self, repo_name: str, content: str) -> None:
    """
    Store README text in the in-memory cache, stamped with the current time.

    Any entry already held under the same repository name is overwritten.

    Args:
        repo_name: Repository name (owner/repo format)
        content: README content to cache
    """
    # The timestamp is what _get_cached_readme compares against the TTL
    entry = {"content": content, "timestamp": time.time()}
    self._readme_cache[repo_name] = entry
    logger.info(f"📝 Cached README for {repo_name} ({len(content)} chars)")
|
| 84 |
+
|
| 85 |
|
| 86 |
async def answer_question(
|
| 87 |
self,
|
|
|
|
| 119 |
|
| 120 |
# If no indexed content, try to fetch README directly from GitHub
|
| 121 |
if not has_indexed_content:
|
| 122 |
+
logger.info(f"No indexed content for {repo_name}, checking cache and fetching README if needed...")
|
| 123 |
+
|
| 124 |
+
# Stage 1: Check cache first
|
| 125 |
+
cached_readme = self._get_cached_readme(repo_name)
|
| 126 |
+
if cached_readme:
|
| 127 |
+
readme_content = cached_readme
|
| 128 |
+
else:
|
| 129 |
+
# Cache miss - fetch from GitHub
|
| 130 |
+
try:
|
| 131 |
+
owner, repo = repo_name.split('/')
|
| 132 |
+
url = f"https://raw.githubusercontent.com/{owner}/{repo}/main/README.md"
|
| 133 |
+
|
| 134 |
+
async with httpx.AsyncClient(timeout=10) as client:
|
|
|
|
| 135 |
response = await client.get(url)
|
| 136 |
if response.status_code == 200:
|
| 137 |
readme_content = response.text
|
| 138 |
+
# Cache the fetched content
|
| 139 |
+
self._cache_readme(repo_name, readme_content)
|
| 140 |
+
logger.info(f"Successfully fetched README for {repo_name} ({len(readme_content)} chars)")
|
| 141 |
else:
|
| 142 |
+
# Try master branch instead
|
| 143 |
+
url = f"https://raw.githubusercontent.com/{owner}/{repo}/master/README.md"
|
| 144 |
+
response = await client.get(url)
|
| 145 |
+
if response.status_code == 200:
|
| 146 |
+
readme_content = response.text
|
| 147 |
+
# Cache the fetched content
|
| 148 |
+
self._cache_readme(repo_name, readme_content)
|
| 149 |
+
logger.info(f"Successfully fetched README (master) for {repo_name} ({len(readme_content)} chars)")
|
| 150 |
+
else:
|
| 151 |
+
logger.warning(f"README not found at {url}")
|
| 152 |
+
except Exception as e:
|
| 153 |
+
logger.error(f"Error fetching README for {repo_name}: {e}")
|
| 154 |
|
| 155 |
# Search for relevant documents (from indexed chunks)
|
| 156 |
relevant_docs = await self.search_documents(question, repo_name, top_k)
|