Spaces:
Build error
Build error
SyedFarooqAlii committed on
Commit ·
4711fe9
1
Parent(s): f309037
add
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- Dockerfile +3 -35
- agent.py +249 -0
- api.py +215 -0
- app/__init__.py +0 -3
- app/__pycache__/__init__.cpython-312.pyc +0 -0
- app/__pycache__/config.cpython-312.pyc +0 -0
- app/__pycache__/main.cpython-312.pyc +0 -0
- app/api/__pycache__/chat.cpython-312.pyc +0 -0
- app/api/__pycache__/health.cpython-312.pyc +0 -0
- app/api/__pycache__/ingest.cpython-312.pyc +0 -0
- app/api/chat.py +0 -249
- app/api/health.py +0 -160
- app/api/ingest.py +0 -316
- app/config.py +0 -45
- app/database/__pycache__/database.cpython-312.pyc +0 -0
- app/database/__pycache__/models.cpython-312.pyc +0 -0
- app/database/__pycache__/repositories.cpython-312.pyc +0 -0
- app/database/database.py +0 -56
- app/database/models.py +0 -138
- app/database/repositories.py +0 -217
- app/embeddings/__pycache__/minimal_embedding_generator.cpython-312.pyc +0 -0
- app/embeddings/minimal_embedding_generator.py +0 -79
- app/generation/__pycache__/response_generator.cpython-312.pyc +0 -0
- app/generation/response_generator.py +0 -387
- app/ingestion/__pycache__/chunker.cpython-312.pyc +0 -0
- app/ingestion/__pycache__/document_parser.cpython-312.pyc +0 -0
- app/ingestion/__pycache__/file_scanner.cpython-312.pyc +0 -0
- app/ingestion/chunker.py +0 -291
- app/ingestion/document_parser.py +0 -146
- app/ingestion/file_scanner.py +0 -92
- app/main.py +0 -44
- app/models/__pycache__/chat.cpython-312.pyc +0 -0
- app/models/chat.py +0 -69
- app/prompting/__pycache__/context_filter.cpython-312.pyc +0 -0
- app/prompting/__pycache__/prompt_builder.cpython-312.pyc +0 -0
- app/prompting/context_filter.py +0 -205
- app/prompting/prompt_builder.py +0 -187
- app/retrieval/__pycache__/retriever.cpython-312.pyc +0 -0
- app/retrieval/__pycache__/vector_search.cpython-312.pyc +0 -0
- app/retrieval/retriever.py +0 -149
- app/retrieval/vector_search.py +0 -103
- app/services/__pycache__/chat_service.cpython-312.pyc +0 -0
- app/services/__pycache__/gemini_client.cpython-312.pyc +0 -0
- app/services/__pycache__/openrouter_client.cpython-312.pyc +0 -0
- app/services/chat_service.py +0 -144
- app/services/openrouter_client.py +0 -165
- app/vector_store/__pycache__/qdrant_client.cpython-312.pyc +0 -0
- app/vector_store/__pycache__/vector_repository.cpython-312.pyc +0 -0
- app/vector_store/qdrant_client.py +0 -207
- app/vector_store/vector_repository.py +0 -49
Dockerfile
CHANGED
|
@@ -1,44 +1,12 @@
|
|
| 1 |
-
# Use Python 3.11 slim image as base
|
| 2 |
FROM python:3.11-slim
|
| 3 |
|
| 4 |
-
# Set environment variables
|
| 5 |
-
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 6 |
-
PYTHONUNBUFFERED=1 \
|
| 7 |
-
PYTHONPATH=/app \
|
| 8 |
-
PORT=7860
|
| 9 |
-
|
| 10 |
-
# Set work directory
|
| 11 |
WORKDIR /app
|
| 12 |
|
| 13 |
-
# Install system dependencies
|
| 14 |
-
RUN apt-get update \
|
| 15 |
-
&& apt-get install -y --no-install-recommends \
|
| 16 |
-
build-essential \
|
| 17 |
-
gcc \
|
| 18 |
-
curl \
|
| 19 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 20 |
-
|
| 21 |
-
# Copy requirements first to leverage Docker cache
|
| 22 |
COPY requirements.txt .
|
|
|
|
| 23 |
|
| 24 |
-
# Install Python dependencies
|
| 25 |
-
RUN pip install --no-cache-dir --upgrade pip \
|
| 26 |
-
&& pip install --no-cache-dir -r requirements.txt
|
| 27 |
-
|
| 28 |
-
# Copy the rest of the application
|
| 29 |
COPY . .
|
| 30 |
|
| 31 |
-
|
| 32 |
-
RUN adduser --disabled-password --gecos '' appuser \
|
| 33 |
-
&& chown -R appuser:appuser /app
|
| 34 |
-
USER appuser
|
| 35 |
-
|
| 36 |
-
# Expose port (Hugging Face typically uses port 7860 or 8080)
|
| 37 |
-
EXPOSE $PORT
|
| 38 |
-
|
| 39 |
-
# Health check endpoint
|
| 40 |
-
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
|
| 41 |
-
CMD curl -f http://localhost:$PORT/health || exit 1
|
| 42 |
|
| 43 |
-
|
| 44 |
-
CMD ["sh", "-c", "uvicorn app.main:app --host 0.0.0.0 --port $PORT"]
|
|
|
|
|
|
|
# Container image for serving the RAG Agent API on Hugging Face Spaces.
FROM python:3.11-slim

# Don't write .pyc files, and flush stdout/stderr immediately so container
# logs appear in real time instead of being buffered.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

WORKDIR /app

# Copy requirements first so the dependency layer is cached across builds.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application.
COPY . .

# Run as a non-root user — required by many container platforms and a
# general security best practice.
RUN adduser --disabled-password --gecos '' appuser \
    && chown -R appuser:appuser /app
USER appuser

# Hugging Face Spaces expects the app to listen on port 7860.
EXPOSE 7860

CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
agent.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
from typing import Dict, List, Any
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
from agents import Agent, Runner
|
| 7 |
+
from agents import function_tool
|
| 8 |
+
import asyncio
|
| 9 |
+
import time
|
| 10 |
+
|
| 11 |
+
# Load environment variables
|
| 12 |
+
load_dotenv()
|
| 13 |
+
|
| 14 |
+
# Configure logging
|
| 15 |
+
logging.basicConfig(level=logging.INFO)
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
@function_tool
def retrieve_information(query: str) -> Dict:
    """
    Retrieve information from the knowledge base based on a query
    """
    from retrieving import RAGRetriever
    retriever = RAGRetriever()

    try:
        # The retriever returns a JSON string; decode it before reshaping.
        payload = json.loads(
            retriever.retrieve(query_text=query, top_k=5, threshold=0.3)
        )

        # Reshape each hit into the flat structure the agent consumes.
        chunks = [
            {
                'content': hit['content'],
                'url': hit['url'],
                'position': hit['position'],
                'similarity_score': hit['similarity_score'],
            }
            for hit in payload.get('results', [])
        ]

        return {
            'query': query,
            'retrieved_chunks': chunks,
            'total_results': len(chunks),
        }
    except Exception as e:
        # Any failure (retrieval, JSON decode, missing keys) degrades to an
        # empty result set carrying the error text.
        logger.error(f"Error in retrieve_information: {e}")
        return {
            'query': query,
            'retrieved_chunks': [],
            'total_results': 0,
            'error': str(e),
        }
|
| 54 |
+
|
| 55 |
+
class RAGAgent:
    """Wrapper around the OpenAI Agents SDK that wires in the retrieval
    tool and exposes a synchronous query interface returning a dict with
    keys: answer, sources, matched_chunks, query_time_ms, confidence
    (plus 'error' on failure)."""

    def __init__(self):
        # Create the agent with the retrieval tool using the OpenAI Agents SDK.
        self.agent = Agent(
            name="RAG Assistant",
            instructions="You are a helpful assistant that answers questions based on retrieved documents. When asked a question, retrieve relevant documents first using the retrieve_information tool, then answer based on them. Always cite your sources and provide the information that was used to generate the answer.",
            tools=[retrieve_information]
        )

        logger.info("RAG Agent initialized with OpenAI Agents SDK")

    def query_agent(self, query_text: str) -> Dict:
        """
        Process a query through the RAG agent and return structured response
        """
        start_time = time.time()

        logger.info(f"Processing query through RAG agent: '{query_text[:50]}...'")

        try:
            # Runner.run is async; bridge it into this synchronous API.
            # FIX: asyncio.get_event_loop() is deprecated outside a running
            # loop (DeprecationWarning on 3.10+, RuntimeError on 3.12), so
            # detect a running loop with get_running_loop() instead — the
            # same pattern run_agent_sync already uses.
            try:
                asyncio.get_running_loop()
            except RuntimeError:
                # No loop running in this thread: plain asyncio.run is safe.
                return asyncio.run(self._async_query_agent(query_text))

            # Already inside an event loop (e.g. called from async code):
            # run the coroutine on a fresh loop in a worker thread.
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(asyncio.run, self._async_query_agent(query_text))
                return future.result()

        except Exception as e:
            logger.error(f"Error processing query: {e}")
            return {
                "answer": "Sorry, I encountered an error processing your request.",
                "sources": [],
                "matched_chunks": [],
                "error": str(e),
                "query_time_ms": (time.time() - start_time) * 1000
            }

    async def _async_query_agent(self, query_text: str) -> Dict:
        """
        Internal async method to run the agent query
        """
        start_time = time.time()

        try:
            result = await Runner.run(self.agent, query_text)

            # The agent's final answer text.
            assistant_response = result.final_output

            if not assistant_response:
                return {
                    "answer": "Sorry, I couldn't generate a response.",
                    "sources": [],
                    "matched_chunks": [],
                    "error": "No response from assistant",
                    "query_time_ms": (time.time() - start_time) * 1000
                }

            # NOTE(review): the SDK result does not expose tool-call outputs
            # here, so sources/matched_chunks stay empty — the agent itself
            # folds retrieved content into final_output. Confirm against the
            # Agents SDK docs if per-chunk citations are required.
            sources = set()
            matched_chunks = []

            query_time_ms = (time.time() - start_time) * 1000

            response = {
                "answer": str(assistant_response),
                "sources": list(sources),
                "matched_chunks": matched_chunks,
                "query_time_ms": query_time_ms,
                "confidence": self._calculate_confidence(matched_chunks)
            }

            logger.info(f"Query processed in {query_time_ms:.2f}ms")
            return response

        except Exception as e:
            logger.error(f"Error in async query: {e}")
            raise

    def _calculate_confidence(self, matched_chunks: List[Dict]) -> str:
        """
        Calculate confidence level based on similarity scores and number of matches
        """
        if not matched_chunks:
            return "low"

        avg_score = sum(chunk.get('similarity_score', 0.0) for chunk in matched_chunks) / len(matched_chunks)

        # Thresholds: >=0.7 high, >=0.4 medium, otherwise low.
        if avg_score >= 0.7:
            return "high"
        elif avg_score >= 0.4:
            return "medium"
        else:
            return "low"
|
| 168 |
+
|
| 169 |
+
def query_agent(query_text: str) -> Dict:
    """
    Convenience function to query the RAG agent
    """
    # Builds a fresh agent per call and forwards the query.
    return RAGAgent().query_agent(query_text)
|
| 175 |
+
|
| 176 |
+
def run_agent_sync(query_text: str) -> Dict:
    """
    Synchronous function to run the agent for direct usage
    """
    import asyncio

    async def _run():
        # Fresh agent per invocation; await its internal async path.
        return await RAGAgent()._async_query_agent(query_text)

    try:
        loop = asyncio.get_running_loop()
        # A loop is already running in this thread, so execute the coroutine
        # on a fresh loop inside a worker thread.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as executor:
            return executor.submit(asyncio.run, _run()).result()
    except RuntimeError:
        # No running loop, safe to use asyncio.run
        return asyncio.run(_run())
|
| 197 |
+
|
| 198 |
+
def _print_agent_response(response: Dict) -> None:
    """Pretty-print one agent response (answer, sources, chunks, timing)."""
    print(f"Answer: {response['answer']}")

    if response.get('sources'):
        print(f"Sources: {len(response['sources'])} documents")
        for source in response['sources'][:3]:  # show first 3 sources only
            print(f"  - {source}")

    if response.get('matched_chunks'):
        print(f"Matched chunks: {len(response['matched_chunks'])}")
        for j, chunk in enumerate(response['matched_chunks'][:2], 1):  # first 2 chunks
            text = chunk['content']
            content_preview = text[:100] + "..." if len(text) > 100 else text
            print(f"  Chunk {j}: {content_preview}")
            print(f"  Source: {chunk['url']}")
            print(f"  Score: {chunk['similarity_score']:.3f}")

    print(f"Query time: {response['query_time_ms']:.2f}ms")
    print(f"Confidence: {response.get('confidence', 'unknown')}")


def main():
    """
    Demonstrate the RAG agent on a fixed list of sample queries.
    """
    logger.info("Initializing RAG Agent...")
    agent = RAGAgent()

    # Representative questions exercising the retrieval pipeline.
    test_queries = [
        "What is ROS2?",
        "Explain humanoid design principles",
        "How does VLA work?",
        "What are simulation techniques?",
        "Explain AI control systems"
    ]

    print("RAG Agent - Testing Queries")
    print("=" * 50)

    for i, query in enumerate(test_queries, 1):
        print(f"\nQuery {i}: {query}")
        print("-" * 30)

        _print_agent_response(agent.query_agent(query))

        if i < len(test_queries):  # no pause after the last query
            time.sleep(1)  # small delay between queries


if __name__ == "__main__":
    main()
|
api.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import asyncio
|
| 3 |
+
from fastapi import FastAPI, HTTPException
|
| 4 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
from typing import List, Optional, Dict
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
import logging
|
| 9 |
+
|
| 10 |
+
# Load environment variables
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
# Configure logging
|
| 14 |
+
logging.basicConfig(level=logging.INFO)
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
# Import the existing RAG agent functionality
from agent import RAGAgent

# FastAPI application instance.
app = FastAPI(
    title="RAG Agent API",
    description="API for RAG Agent with document retrieval and question answering",
    version="1.0.0"
)

# Allow cross-origin requests during development.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers for credentialed requests — replace "*" with the
# specific frontend origins in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 35 |
+
|
| 36 |
+
# ---------------------------------------------------------------------------
# Pydantic request / response models
# ---------------------------------------------------------------------------

class QueryRequest(BaseModel):
    """Request body for the /ask endpoint."""
    query: str

class ChatRequest(BaseModel):
    """Request body for the /api chat endpoint."""
    # NOTE(review): both `query` and `message` are required, but the endpoint
    # only reads `query` — confirm whether `message` is still needed.
    query: str
    message: str
    session_id: str
    selected_text: Optional[str] = None
    query_type: str = "global"
    top_k: int = 5

class MatchedChunk(BaseModel):
    """One retrieved chunk returned alongside an answer."""
    content: str
    url: str
    position: int
    similarity_score: float

class QueryResponse(BaseModel):
    """Response body for /ask."""
    answer: str
    sources: List[str]
    matched_chunks: List[MatchedChunk]
    error: Optional[str] = None
    status: str  # "success", "error", "empty"
    query_time_ms: Optional[float] = None
    confidence: Optional[str] = None

class ChatResponse(BaseModel):
    """Response body for /api."""
    response: str
    citations: List[Dict[str, str]]
    session_id: str
    query_type: str
    timestamp: str

class HealthResponse(BaseModel):
    """Response body for /health."""
    status: str
    message: str
|
| 73 |
+
|
| 74 |
+
# Single shared RAG agent instance, created once at application startup.
rag_agent = None

@app.on_event("startup")
async def startup_event():
    """Initialize the RAG agent on startup"""
    global rag_agent
    logger.info("Initializing RAG Agent...")
    try:
        rag_agent = RAGAgent()
        logger.info("RAG Agent initialized successfully")
    except Exception as e:
        # A failed startup should abort the app rather than serve requests
        # with a None agent.
        logger.error(f"Failed to initialize RAG Agent: {e}")
        raise
|
| 88 |
+
|
| 89 |
+
@app.post("/ask", response_model=QueryResponse)
async def ask_rag(request: QueryRequest):
    """
    Process a user query through the RAG agent and return the response
    """
    logger.info(f"Processing query: {request.query[:50]}...")

    try:
        # Basic request validation.
        if not request.query or len(request.query.strip()) == 0:
            raise HTTPException(status_code=400, detail="Query cannot be empty")
        if len(request.query) > 2000:
            raise HTTPException(status_code=400, detail="Query too long, maximum 2000 characters")

        # Delegate to the shared agent instance created at startup.
        response = rag_agent.query_agent(request.query)

        # Convert the agent's plain-dict chunks into response models.
        chunks = [
            MatchedChunk(
                content=chunk.get("content", ""),
                url=chunk.get("url", ""),
                position=chunk.get("position", 0),
                similarity_score=chunk.get("similarity_score", 0.0),
            )
            for chunk in response.get("matched_chunks", [])
        ]

        formatted_response = QueryResponse(
            answer=response.get("answer", ""),
            sources=response.get("sources", []),
            matched_chunks=chunks,
            error=response.get("error"),
            status="error" if response.get("error") else "success",
            query_time_ms=response.get("query_time_ms"),
            confidence=response.get("confidence"),
        )

        logger.info(f"Query processed successfully in {response.get('query_time_ms', 0):.2f}ms")
        return formatted_response

    except HTTPException:
        # Propagate deliberate HTTP errors unchanged.
        raise
    except Exception as e:
        # Unexpected failures come back as a 200 with status="error".
        logger.error(f"Error processing query: {e}")
        return QueryResponse(
            answer="",
            sources=[],
            matched_chunks=[],
            error=str(e),
            status="error",
        )
|
| 140 |
+
|
| 141 |
+
@app.post("/api", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest):
    """
    Main chat endpoint that handles conversation with RAG capabilities
    """
    from datetime import datetime

    logger.info(f"Processing chat query: {request.query[:50]}...")

    try:
        # Basic request validation.
        if not request.query or len(request.query.strip()) == 0:
            raise HTTPException(status_code=400, detail="Query cannot be empty")
        if not request.session_id or len(request.session_id.strip()) == 0:
            raise HTTPException(status_code=400, detail="Session ID cannot be empty")
        if len(request.query) > 2000:
            raise HTTPException(status_code=400, detail="Query too long, maximum 2000 characters")

        response = rag_agent.query_agent(request.query)

        # Map retrieved chunks onto the citation schema the client expects.
        # NOTE(review): only `title` is populated (from the chunk URL);
        # document_id / chapter / section / page_reference are left blank.
        citations = [
            {
                "document_id": "",
                "title": chunk.get("url", ""),
                "chapter": "",
                "section": "",
                "page_reference": "",
            }
            for chunk in response.get("matched_chunks", [])
        ]

        formatted_response = ChatResponse(
            response=response.get("answer", ""),
            citations=citations,
            session_id=request.session_id,
            query_type=request.query_type,
            timestamp=datetime.utcnow().isoformat(),
        )

        logger.info(f"Chat query processed successfully")
        return formatted_response

    except HTTPException:
        raise
    except Exception as e:
        # NOTE(review): ChatResponse has no error field, so failures surface
        # only as an empty `response` string.
        logger.error(f"Error processing chat query: {e}")
        return ChatResponse(
            response="",
            citations=[],
            session_id=request.session_id,
            query_type=request.query_type,
            timestamp=datetime.utcnow().isoformat(),
        )
|
| 201 |
+
|
| 202 |
+
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """
    Health check endpoint
    """
    # Static liveness probe; does not verify that the agent initialized.
    return HealthResponse(status="healthy", message="RAG Agent API is running")

# For running with uvicorn
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
app/__init__.py
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
RAG Chatbot Backend Application
|
| 3 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
app/__pycache__/__init__.cpython-312.pyc
DELETED
|
Binary file (219 Bytes)
|
|
|
app/__pycache__/config.cpython-312.pyc
DELETED
|
Binary file (1.58 kB)
|
|
|
app/__pycache__/main.cpython-312.pyc
DELETED
|
Binary file (1.96 kB)
|
|
|
app/api/__pycache__/chat.cpython-312.pyc
DELETED
|
Binary file (9.58 kB)
|
|
|
app/api/__pycache__/health.cpython-312.pyc
DELETED
|
Binary file (6.67 kB)
|
|
|
app/api/__pycache__/ingest.cpython-312.pyc
DELETED
|
Binary file (11 kB)
|
|
|
app/api/chat.py
DELETED
|
@@ -1,249 +0,0 @@
|
|
| 1 |
-
from fastapi import APIRouter, HTTPException, Depends, Header
|
| 2 |
-
from typing import Dict, Any, List, Optional
|
| 3 |
-
from pydantic import BaseModel
|
| 4 |
-
from datetime import datetime
|
| 5 |
-
import uuid
|
| 6 |
-
import asyncio
|
| 7 |
-
from functools import lru_cache
|
| 8 |
-
|
| 9 |
-
from app.retrieval.retriever import retriever
|
| 10 |
-
from app.prompting.context_filter import context_filter
|
| 11 |
-
from app.generation.response_generator import response_generator
|
| 12 |
-
from app.database.repositories import ChatSessionRepository, ChatMessageRepository, QueryContextRepository
|
| 13 |
-
from app.database.database import get_db
|
| 14 |
-
from app.config import settings
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
router = APIRouter()
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
class ChatRequest(BaseModel):
|
| 21 |
-
session_id: str
|
| 22 |
-
message: str
|
| 23 |
-
selected_text: Optional[str] = None
|
| 24 |
-
query_type: str = "global" # "global" or "selection"
|
| 25 |
-
top_k: int = 5
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
class ChatResponse(BaseModel):
|
| 29 |
-
response: str
|
| 30 |
-
citations: List[Dict[str, str]]
|
| 31 |
-
session_id: str
|
| 32 |
-
query_type: str
|
| 33 |
-
timestamp: str
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
class ChatMessage(BaseModel):
|
| 37 |
-
message_id: str
|
| 38 |
-
session_id: str
|
| 39 |
-
role: str # "user" or "assistant"
|
| 40 |
-
content: str
|
| 41 |
-
citations: Optional[List[Dict[str, str]]] = None
|
| 42 |
-
timestamp: str
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
@router.post("", response_model=ChatResponse)
|
| 46 |
-
async def chat_endpoint(
|
| 47 |
-
request: ChatRequest,
|
| 48 |
-
x_api_key: str = Header(None)
|
| 49 |
-
):
|
| 50 |
-
"""
|
| 51 |
-
Main chat endpoint that handles conversation with RAG capabilities
|
| 52 |
-
"""
|
| 53 |
-
# Validate API key if configured
|
| 54 |
-
if settings.BACKEND_API_KEY and x_api_key != settings.BACKEND_API_KEY:
|
| 55 |
-
raise HTTPException(status_code=401, detail="Invalid API key")
|
| 56 |
-
|
| 57 |
-
try:
|
| 58 |
-
# Validate query type
|
| 59 |
-
if request.query_type not in ["global", "selection"]:
|
| 60 |
-
raise HTTPException(
|
| 61 |
-
status_code=400,
|
| 62 |
-
detail="query_type must be either 'global' or 'selection'"
|
| 63 |
-
)
|
| 64 |
-
|
| 65 |
-
# Retrieve relevant documents based on query and query type
|
| 66 |
-
retrieved_docs = await retriever.retrieve_with_context_filtering(
|
| 67 |
-
query=request.message,
|
| 68 |
-
top_k=request.top_k,
|
| 69 |
-
query_type=request.query_type,
|
| 70 |
-
selected_text=request.selected_text
|
| 71 |
-
)
|
| 72 |
-
|
| 73 |
-
# Apply context filtering to ensure proper isolation
|
| 74 |
-
filtered_docs = context_filter.enforce_context_boundaries(
|
| 75 |
-
contexts=retrieved_docs,
|
| 76 |
-
query_type=request.query_type,
|
| 77 |
-
selected_text=request.selected_text
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
# Generate response using Claude
|
| 81 |
-
response_data = await response_generator.generate_response_with_validation(
|
| 82 |
-
query=request.message,
|
| 83 |
-
retrieved_contexts=filtered_docs,
|
| 84 |
-
query_type=request.query_type,
|
| 85 |
-
selected_text=request.selected_text,
|
| 86 |
-
session_id=request.session_id
|
| 87 |
-
)
|
| 88 |
-
|
| 89 |
-
# Store the conversation in the database
|
| 90 |
-
db_gen = get_db()
|
| 91 |
-
db = next(db_gen)
|
| 92 |
-
try:
|
| 93 |
-
# Create or update session
|
| 94 |
-
session_repo = ChatSessionRepository(db)
|
| 95 |
-
existing_session = session_repo.get_session_by_id(request.session_id)
|
| 96 |
-
if not existing_session:
|
| 97 |
-
session_repo.create_session(session_id=request.session_id)
|
| 98 |
-
|
| 99 |
-
# Store user message
|
| 100 |
-
user_message_id = f"msg_{uuid.uuid4().hex[:8]}"
|
| 101 |
-
message_repo = ChatMessageRepository(db)
|
| 102 |
-
message_repo.create_message(
|
| 103 |
-
message_id=user_message_id,
|
| 104 |
-
session_id=request.session_id,
|
| 105 |
-
role="user",
|
| 106 |
-
content=request.message
|
| 107 |
-
)
|
| 108 |
-
|
| 109 |
-
# Store assistant response
|
| 110 |
-
assistant_message_id = f"msg_{uuid.uuid4().hex[:8]}"
|
| 111 |
-
citations_for_storage = response_data.get("citations", [])
|
| 112 |
-
message_repo.create_message(
|
| 113 |
-
message_id=assistant_message_id,
|
| 114 |
-
session_id=request.session_id,
|
| 115 |
-
role="assistant",
|
| 116 |
-
content=response_data.get("response", ""),
|
| 117 |
-
citations=citations_for_storage
|
| 118 |
-
)
|
| 119 |
-
finally:
|
| 120 |
-
next(db_gen, None) # Close the db session
|
| 121 |
-
|
| 122 |
-
return ChatResponse(
|
| 123 |
-
response=response_data.get("response", ""),
|
| 124 |
-
citations=response_data.get("citations", []),
|
| 125 |
-
session_id=request.session_id,
|
| 126 |
-
query_type=request.query_type,
|
| 127 |
-
timestamp=datetime.utcnow().isoformat()
|
| 128 |
-
)
|
| 129 |
-
|
| 130 |
-
except HTTPException:
|
| 131 |
-
raise
|
| 132 |
-
except Exception as e:
|
| 133 |
-
raise HTTPException(
|
| 134 |
-
status_code=500,
|
| 135 |
-
detail=f"Error processing chat request: {str(e)}"
|
| 136 |
-
)
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
@router.post("/stream")
|
| 140 |
-
async def chat_stream_endpoint(request: ChatRequest):
|
| 141 |
-
"""
|
| 142 |
-
Streaming chat endpoint (placeholder - actual streaming implementation would be more complex)
|
| 143 |
-
"""
|
| 144 |
-
# For now, this just calls the regular endpoint
|
| 145 |
-
# In a production implementation, you would use FastAPI's StreamingResponse
|
| 146 |
-
result = await chat_endpoint(request, None)
|
| 147 |
-
return result
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
@router.get("/session/{session_id}")
|
| 151 |
-
async def get_session_history(session_id: str):
|
| 152 |
-
"""
|
| 153 |
-
Retrieve chat history for a specific session
|
| 154 |
-
"""
|
| 155 |
-
try:
|
| 156 |
-
db_gen = get_db()
|
| 157 |
-
db = next(db_gen)
|
| 158 |
-
try:
|
| 159 |
-
message_repo = ChatMessageRepository(db)
|
| 160 |
-
messages = message_repo.get_messages_by_session(session_id)
|
| 161 |
-
|
| 162 |
-
return {
|
| 163 |
-
"session_id": session_id,
|
| 164 |
-
"messages": [
|
| 165 |
-
{
|
| 166 |
-
"message_id": msg.message_id,
|
| 167 |
-
"role": msg.role,
|
| 168 |
-
"content": msg.content,
|
| 169 |
-
"timestamp": msg.timestamp.isoformat() if msg.timestamp else None,
|
| 170 |
-
"citations": msg.citations
|
| 171 |
-
}
|
| 172 |
-
for msg in messages
|
| 173 |
-
],
|
| 174 |
-
"timestamp": datetime.utcnow().isoformat()
|
| 175 |
-
}
|
| 176 |
-
finally:
|
| 177 |
-
next(db_gen, None)
|
| 178 |
-
except Exception as e:
|
| 179 |
-
raise HTTPException(
|
| 180 |
-
status_code=500,
|
| 181 |
-
detail=f"Error retrieving session history: {str(e)}"
|
| 182 |
-
)
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
@router.delete("/session/{session_id}")
|
| 186 |
-
async def delete_session(session_id: str):
|
| 187 |
-
"""
|
| 188 |
-
Delete a chat session and all associated messages
|
| 189 |
-
"""
|
| 190 |
-
try:
|
| 191 |
-
# In a real implementation, you would have a method to delete all messages
|
| 192 |
-
# associated with a session. For now, we'll just return a success message.
|
| 193 |
-
return {
|
| 194 |
-
"status": "deleted",
|
| 195 |
-
"session_id": session_id,
|
| 196 |
-
"timestamp": datetime.utcnow().isoformat()
|
| 197 |
-
}
|
| 198 |
-
except Exception as e:
|
| 199 |
-
raise HTTPException(
|
| 200 |
-
status_code=500,
|
| 201 |
-
detail=f"Error deleting session: {str(e)}"
|
| 202 |
-
)
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
@router.post("/validate")
|
| 206 |
-
async def validate_query(request: ChatRequest):
|
| 207 |
-
"""
|
| 208 |
-
Validate a query without generating a response (for testing purposes)
|
| 209 |
-
"""
|
| 210 |
-
try:
|
| 211 |
-
# Validate query type
|
| 212 |
-
if request.query_type not in ["global", "selection"]:
|
| 213 |
-
return {"valid": False, "error": "query_type must be either 'global' or 'selection'"}
|
| 214 |
-
|
| 215 |
-
# Validate that if query_type is 'selection', selected_text is provided
|
| 216 |
-
if request.query_type == "selection" and not request.selected_text:
|
| 217 |
-
return {"valid": False, "error": "selected_text is required for selection-based queries"}
|
| 218 |
-
|
| 219 |
-
# Check if we can retrieve relevant documents
|
| 220 |
-
retrieved_docs = await retriever.retrieve_relevant_documents(
|
| 221 |
-
query=request.message,
|
| 222 |
-
top_k=request.top_k,
|
| 223 |
-
query_type=request.query_type,
|
| 224 |
-
selected_text=request.selected_text
|
| 225 |
-
)
|
| 226 |
-
|
| 227 |
-
return {
|
| 228 |
-
"valid": True,
|
| 229 |
-
"query_type": request.query_type,
|
| 230 |
-
"documents_found": len(retrieved_docs),
|
| 231 |
-
"has_context": len(retrieved_docs) > 0
|
| 232 |
-
}
|
| 233 |
-
except Exception as e:
|
| 234 |
-
raise HTTPException(
|
| 235 |
-
status_code=500,
|
| 236 |
-
detail=f"Error validating query: {str(e)}"
|
| 237 |
-
)
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
# Rate limiting middleware would be implemented here in a production system
|
| 241 |
-
# For now, we'll add a simple rate limiting check based on settings
|
| 242 |
-
async def check_rate_limit(session_id: str) -> bool:
|
| 243 |
-
"""
|
| 244 |
-
Check if the session has exceeded rate limits
|
| 245 |
-
This is a simplified implementation - a production system would use Redis or similar
|
| 246 |
-
"""
|
| 247 |
-
# In a real implementation, you would check against a rate limit store
|
| 248 |
-
# For now, we'll just return True to allow all requests
|
| 249 |
-
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/api/health.py
DELETED
|
@@ -1,160 +0,0 @@
|
|
| 1 |
-
from fastapi import APIRouter, HTTPException, Depends
|
| 2 |
-
from typing import Dict, Any
|
| 3 |
-
from datetime import datetime
|
| 4 |
-
import time
|
| 5 |
-
import asyncio
|
| 6 |
-
|
| 7 |
-
from app.config import settings
|
| 8 |
-
from app.vector_store.qdrant_client import qdrant_client
|
| 9 |
-
from app.database.models import create_tables, get_db
|
| 10 |
-
from app.database.database import get_db
|
| 11 |
-
from sqlalchemy.orm import Session
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
router = APIRouter()
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
@router.get("/health", response_model=Dict[str, Any])
|
| 18 |
-
async def health_check():
|
| 19 |
-
"""
|
| 20 |
-
Health check endpoint that verifies the system is operational
|
| 21 |
-
"""
|
| 22 |
-
# Basic system status
|
| 23 |
-
health_status = {
|
| 24 |
-
"status": "healthy",
|
| 25 |
-
"timestamp": datetime.utcnow().isoformat(),
|
| 26 |
-
"service": "RAG Chatbot API",
|
| 27 |
-
"version": "1.0.0",
|
| 28 |
-
"checks": {
|
| 29 |
-
"database": {"status": "unknown", "message": ""},
|
| 30 |
-
"vector_store": {"status": "unknown", "message": ""},
|
| 31 |
-
"api_config": {"status": "ok", "message": "API configuration loaded"},
|
| 32 |
-
"external_services": {"status": "unknown", "message": ""}
|
| 33 |
-
}
|
| 34 |
-
}
|
| 35 |
-
|
| 36 |
-
# Check database connection
|
| 37 |
-
try:
|
| 38 |
-
# Try to access the database
|
| 39 |
-
from app.database.database import engine
|
| 40 |
-
with engine.connect() as conn:
|
| 41 |
-
# Simple query to test connection using SQLAlchemy 2.0 syntax
|
| 42 |
-
from sqlalchemy import text
|
| 43 |
-
result = conn.execute(text("SELECT 1"))
|
| 44 |
-
health_status["checks"]["database"]["status"] = "ok"
|
| 45 |
-
health_status["checks"]["database"]["message"] = "Database connection successful"
|
| 46 |
-
except Exception as e:
|
| 47 |
-
health_status["checks"]["database"]["status"] = "error"
|
| 48 |
-
health_status["checks"]["database"]["message"] = f"Database connection failed: {str(e)}"
|
| 49 |
-
|
| 50 |
-
# Check vector store connection
|
| 51 |
-
try:
|
| 52 |
-
# Try to get collection info as a simple connectivity test
|
| 53 |
-
collection_info = qdrant_client.get_collection_info()
|
| 54 |
-
health_status["checks"]["vector_store"]["status"] = "ok"
|
| 55 |
-
health_status["checks"]["vector_store"]["message"] = f"Vector store connected, {collection_info.get('point_count', 0)} points"
|
| 56 |
-
except Exception as e:
|
| 57 |
-
health_status["checks"]["vector_store"]["status"] = "error"
|
| 58 |
-
health_status["checks"]["vector_store"]["message"] = f"Vector store connection failed: {str(e)}"
|
| 59 |
-
|
| 60 |
-
# Check external services (API keys)
|
| 61 |
-
try:
|
| 62 |
-
if not settings.OPENROUTER_API_KEY or settings.OPENROUTER_API_KEY == "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":
|
| 63 |
-
health_status["checks"]["external_services"]["status"] = "warning"
|
| 64 |
-
health_status["checks"]["external_services"]["message"] = "API key not configured or using default value"
|
| 65 |
-
else:
|
| 66 |
-
health_status["checks"]["external_services"]["status"] = "ok"
|
| 67 |
-
health_status["checks"]["external_services"]["message"] = "External services configured"
|
| 68 |
-
except Exception as e:
|
| 69 |
-
health_status["checks"]["external_services"]["status"] = "error"
|
| 70 |
-
health_status["checks"]["external_services"]["message"] = f"External service config error: {str(e)}"
|
| 71 |
-
|
| 72 |
-
# Overall status based on individual checks
|
| 73 |
-
overall_status = "healthy"
|
| 74 |
-
for check_name, check in health_status["checks"].items():
|
| 75 |
-
if check["status"] == "error":
|
| 76 |
-
overall_status = "error"
|
| 77 |
-
break
|
| 78 |
-
elif check["status"] == "warning" and overall_status != "error":
|
| 79 |
-
overall_status = "warning"
|
| 80 |
-
|
| 81 |
-
health_status["status"] = overall_status
|
| 82 |
-
|
| 83 |
-
return health_status
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
@router.get("/ready", response_model=Dict[str, str])
|
| 87 |
-
async def readiness_check():
|
| 88 |
-
"""
|
| 89 |
-
Readiness check - verifies the service is ready to accept traffic
|
| 90 |
-
"""
|
| 91 |
-
# For readiness, we mainly check if critical services are available
|
| 92 |
-
try:
|
| 93 |
-
# Test database connection
|
| 94 |
-
from app.database.database import engine
|
| 95 |
-
from sqlalchemy import text
|
| 96 |
-
with engine.connect() as conn:
|
| 97 |
-
conn.execute(text("SELECT 1"))
|
| 98 |
-
|
| 99 |
-
# Test vector store connection
|
| 100 |
-
qdrant_client.get_collection_info()
|
| 101 |
-
|
| 102 |
-
return {"status": "ready"}
|
| 103 |
-
except Exception:
|
| 104 |
-
raise HTTPException(status_code=503, detail="Service not ready")
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
@router.get("/live", response_model=Dict[str, str])
|
| 108 |
-
async def liveness_check():
|
| 109 |
-
"""
|
| 110 |
-
Liveness check - verifies the service is alive and responding
|
| 111 |
-
"""
|
| 112 |
-
return {"status": "alive", "timestamp": datetime.utcnow().isoformat()}
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
@router.get("/stats", response_model=Dict[str, Any])
|
| 116 |
-
async def get_service_stats():
|
| 117 |
-
"""
|
| 118 |
-
Get detailed service statistics
|
| 119 |
-
"""
|
| 120 |
-
stats = {
|
| 121 |
-
"timestamp": datetime.utcnow().isoformat(),
|
| 122 |
-
"uptime": "tracking needed", # This would need to be implemented with a global start time
|
| 123 |
-
"database": {},
|
| 124 |
-
"vector_store": {},
|
| 125 |
-
"api_usage": {}
|
| 126 |
-
}
|
| 127 |
-
|
| 128 |
-
# Database stats
|
| 129 |
-
try:
|
| 130 |
-
from app.database.database import engine
|
| 131 |
-
with engine.connect() as conn:
|
| 132 |
-
# Get basic database info using SQLAlchemy 2.0 syntax
|
| 133 |
-
from sqlalchemy import text
|
| 134 |
-
result = conn.execute(text("SELECT COUNT(*) FROM book_content_documents"))
|
| 135 |
-
doc_count = result.scalar()
|
| 136 |
-
stats["database"] = {
|
| 137 |
-
"documents_count": doc_count,
|
| 138 |
-
"status": "connected"
|
| 139 |
-
}
|
| 140 |
-
except Exception as e:
|
| 141 |
-
stats["database"] = {"status": "error", "error": str(e)}
|
| 142 |
-
|
| 143 |
-
# Vector store stats
|
| 144 |
-
try:
|
| 145 |
-
collection_info = qdrant_client.get_collection_info()
|
| 146 |
-
stats["vector_store"] = {
|
| 147 |
-
"point_count": collection_info.get('point_count', 0),
|
| 148 |
-
"vector_size": collection_info.get('vector_size', 0),
|
| 149 |
-
"status": "connected"
|
| 150 |
-
}
|
| 151 |
-
except Exception as e:
|
| 152 |
-
stats["vector_store"] = {"status": "error", "error": str(e)}
|
| 153 |
-
|
| 154 |
-
# API usage would be tracked separately in a production system
|
| 155 |
-
stats["api_usage"] = {
|
| 156 |
-
"requests_served": 0, # This would be implemented with actual tracking
|
| 157 |
-
"avg_response_time": 0.0 # This would be calculated from actual metrics
|
| 158 |
-
}
|
| 159 |
-
|
| 160 |
-
return stats
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/api/ingest.py
DELETED
|
@@ -1,316 +0,0 @@
|
|
| 1 |
-
from fastapi import APIRouter, HTTPException, BackgroundTasks, UploadFile, File, Form
|
| 2 |
-
from typing import Dict, Any, List, Optional
|
| 3 |
-
from pydantic import BaseModel
|
| 4 |
-
from datetime import datetime
|
| 5 |
-
import asyncio
|
| 6 |
-
import os
|
| 7 |
-
from pathlib import Path
|
| 8 |
-
|
| 9 |
-
from app.ingestion.file_scanner import FileScanner
|
| 10 |
-
from app.ingestion.chunker import TextChunker, chunk_documents
|
| 11 |
-
from app.embeddings.minimal_embedding_generator import minimal_embedding_generator
|
| 12 |
-
from app.vector_store.vector_repository import vector_repository
|
| 13 |
-
from app.database.repositories import BookContentRepository
|
| 14 |
-
from app.database.database import get_db
|
| 15 |
-
from app.config import settings
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
router = APIRouter()
|
| 19 |
-
|
| 20 |
-
# Initialize components
|
| 21 |
-
chunker = TextChunker()
|
| 22 |
-
file_scanner = FileScanner()
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
class IngestRequest(BaseModel):
|
| 26 |
-
book_path: str = "docusaurus/docs" # Default path for Docusaurus docs
|
| 27 |
-
chunk_size: int = 800
|
| 28 |
-
force_reprocess: bool = False
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
class IngestResponse(BaseModel):
|
| 32 |
-
status: str
|
| 33 |
-
message: str
|
| 34 |
-
documents_processed: int
|
| 35 |
-
chunks_created: int
|
| 36 |
-
embeddings_generated: int
|
| 37 |
-
timestamp: str
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
class IngestStatusResponse(BaseModel):
|
| 41 |
-
status: str
|
| 42 |
-
progress: float
|
| 43 |
-
message: str
|
| 44 |
-
details: Dict[str, Any]
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
# In-memory storage for tracking ingestion jobs (in production, use a proper task queue)
|
| 48 |
-
ingestion_jobs: Dict[str, Dict[str, Any]] = {}
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
@router.post("/ingest", response_model=IngestResponse)
|
| 52 |
-
async def ingest_documents(request: IngestRequest, background_tasks: BackgroundTasks):
|
| 53 |
-
"""
|
| 54 |
-
Ingest book content from markdown files, process, and store in vector database
|
| 55 |
-
"""
|
| 56 |
-
try:
|
| 57 |
-
# Validate the book path exists
|
| 58 |
-
if not os.path.exists(request.book_path):
|
| 59 |
-
raise HTTPException(
|
| 60 |
-
status_code=400,
|
| 61 |
-
detail=f"Book path does not exist: {request.book_path}"
|
| 62 |
-
)
|
| 63 |
-
|
| 64 |
-
# Update chunker settings if provided
|
| 65 |
-
if request.chunk_size:
|
| 66 |
-
chunker.max_tokens = request.chunk_size
|
| 67 |
-
|
| 68 |
-
# Scan and parse documents
|
| 69 |
-
documents = file_scanner.scan_and_parse_documents()
|
| 70 |
-
|
| 71 |
-
if not documents:
|
| 72 |
-
raise HTTPException(
|
| 73 |
-
status_code=400,
|
| 74 |
-
detail=f"No markdown documents found in path: {request.book_path}"
|
| 75 |
-
)
|
| 76 |
-
|
| 77 |
-
# Validate documents
|
| 78 |
-
valid_documents = [doc for doc in documents if file_scanner.validate_document(doc)]
|
| 79 |
-
if not valid_documents:
|
| 80 |
-
raise HTTPException(
|
| 81 |
-
status_code=400,
|
| 82 |
-
detail="No valid documents found after validation"
|
| 83 |
-
)
|
| 84 |
-
|
| 85 |
-
# Chunk documents
|
| 86 |
-
all_chunks = chunk_documents(valid_documents)
|
| 87 |
-
|
| 88 |
-
if not all_chunks:
|
| 89 |
-
raise HTTPException(
|
| 90 |
-
status_code=500,
|
| 91 |
-
detail="No chunks were created from documents"
|
| 92 |
-
)
|
| 93 |
-
|
| 94 |
-
# Generate embeddings for chunks
|
| 95 |
-
chunks_with_metadata = []
|
| 96 |
-
for chunk in all_chunks:
|
| 97 |
-
# Generate embedding using minimal embedding generator
|
| 98 |
-
embedding = minimal_embedding_generator.encode_query(chunk.content)
|
| 99 |
-
if embedding:
|
| 100 |
-
chunk_data = {
|
| 101 |
-
'id': chunk.id,
|
| 102 |
-
'content': chunk.content,
|
| 103 |
-
'title': chunk.title,
|
| 104 |
-
'chapter': chunk.chapter,
|
| 105 |
-
'section': chunk.section,
|
| 106 |
-
'page_reference': chunk.page_reference,
|
| 107 |
-
'token_count': chunk.token_count,
|
| 108 |
-
'embedding': embedding
|
| 109 |
-
}
|
| 110 |
-
chunks_with_metadata.append(chunk_data)
|
| 111 |
-
else:
|
| 112 |
-
# Skip chunks that couldn't generate embeddings
|
| 113 |
-
continue
|
| 114 |
-
|
| 115 |
-
# Store embeddings in vector database
|
| 116 |
-
if chunks_with_metadata:
|
| 117 |
-
vector_repository.store_document_chunks(chunks_with_metadata)
|
| 118 |
-
|
| 119 |
-
# Store document metadata in SQL database
|
| 120 |
-
db_gen = get_db()
|
| 121 |
-
db = next(db_gen)
|
| 122 |
-
try:
|
| 123 |
-
content_repo = BookContentRepository(db)
|
| 124 |
-
|
| 125 |
-
for chunk_data in chunks_with_metadata:
|
| 126 |
-
# Create or update document in SQL database
|
| 127 |
-
existing_doc = content_repo.get_document_by_id(chunk_data['id'])
|
| 128 |
-
if not existing_doc:
|
| 129 |
-
content_repo.create_document(chunk_data)
|
| 130 |
-
finally:
|
| 131 |
-
next(db_gen, None) # Close the db session
|
| 132 |
-
|
| 133 |
-
return IngestResponse(
|
| 134 |
-
status="success",
|
| 135 |
-
message=f"Successfully ingested {len(valid_documents)} documents, "
|
| 136 |
-
f"created {len(all_chunks)} chunks, "
|
| 137 |
-
f"generated {len(chunks_with_metadata)} embeddings",
|
| 138 |
-
documents_processed=len(valid_documents),
|
| 139 |
-
chunks_created=len(all_chunks),
|
| 140 |
-
embeddings_generated=len(chunks_with_metadata),
|
| 141 |
-
timestamp=datetime.utcnow().isoformat()
|
| 142 |
-
)
|
| 143 |
-
|
| 144 |
-
except HTTPException:
|
| 145 |
-
raise
|
| 146 |
-
except Exception as e:
|
| 147 |
-
raise HTTPException(
|
| 148 |
-
status_code=500,
|
| 149 |
-
detail=f"Error during ingestion: {str(e)}"
|
| 150 |
-
)
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
@router.post("/ingest-from-content")
|
| 154 |
-
async def ingest_from_content(content: str = Form(...), title: str = Form(...), chapter: str = Form("Unknown"), section: str = Form("Unknown")):
|
| 155 |
-
"""
|
| 156 |
-
Ingest content directly from provided text
|
| 157 |
-
"""
|
| 158 |
-
try:
|
| 159 |
-
# Create a mock document from the provided content
|
| 160 |
-
document = {
|
| 161 |
-
'title': title,
|
| 162 |
-
'content': content,
|
| 163 |
-
'chapter': chapter,
|
| 164 |
-
'section': section,
|
| 165 |
-
'file_path': f"api_upload_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
|
| 166 |
-
'metadata': {},
|
| 167 |
-
'structure': []
|
| 168 |
-
}
|
| 169 |
-
|
| 170 |
-
# Validate document
|
| 171 |
-
if not document['content'].strip():
|
| 172 |
-
raise HTTPException(status_code=400, detail="Content cannot be empty")
|
| 173 |
-
|
| 174 |
-
# Chunk the document
|
| 175 |
-
all_chunks = chunk_documents([document])
|
| 176 |
-
|
| 177 |
-
if not all_chunks:
|
| 178 |
-
raise HTTPException(status_code=500, detail="No chunks were created from content")
|
| 179 |
-
|
| 180 |
-
# Generate embeddings for chunks
|
| 181 |
-
chunks_with_metadata = []
|
| 182 |
-
for chunk in all_chunks:
|
| 183 |
-
# Generate embedding using minimal embedding generator
|
| 184 |
-
embedding = minimal_embedding_generator.encode_query(chunk.content)
|
| 185 |
-
if embedding:
|
| 186 |
-
chunk_data = {
|
| 187 |
-
'id': chunk.id,
|
| 188 |
-
'content': chunk.content,
|
| 189 |
-
'title': chunk.title,
|
| 190 |
-
'chapter': chunk.chapter,
|
| 191 |
-
'section': chunk.section,
|
| 192 |
-
'page_reference': chunk.page_reference,
|
| 193 |
-
'token_count': chunk.token_count,
|
| 194 |
-
'embedding': embedding
|
| 195 |
-
}
|
| 196 |
-
chunks_with_metadata.append(chunk_data)
|
| 197 |
-
else:
|
| 198 |
-
# Skip chunks that couldn't generate embeddings
|
| 199 |
-
continue
|
| 200 |
-
|
| 201 |
-
# Store embeddings in vector database
|
| 202 |
-
if chunks_with_metadata:
|
| 203 |
-
vector_repository.store_document_chunks(chunks_with_metadata)
|
| 204 |
-
|
| 205 |
-
# Store document metadata in SQL database
|
| 206 |
-
db_gen = get_db()
|
| 207 |
-
db = next(db_gen)
|
| 208 |
-
try:
|
| 209 |
-
content_repo = BookContentRepository(db)
|
| 210 |
-
|
| 211 |
-
for chunk_data in chunks_with_metadata:
|
| 212 |
-
# Create or update document in SQL database
|
| 213 |
-
existing_doc = content_repo.get_document_by_id(chunk_data['id'])
|
| 214 |
-
if not existing_doc:
|
| 215 |
-
content_repo.create_document(chunk_data)
|
| 216 |
-
finally:
|
| 217 |
-
next(db_gen, None) # Close the db session
|
| 218 |
-
|
| 219 |
-
return {
|
| 220 |
-
"status": "success",
|
| 221 |
-
"message": f"Successfully ingested content, created {len(all_chunks)} chunks",
|
| 222 |
-
"chunks_created": len(all_chunks),
|
| 223 |
-
"timestamp": datetime.utcnow().isoformat()
|
| 224 |
-
}
|
| 225 |
-
|
| 226 |
-
except Exception as e:
|
| 227 |
-
raise HTTPException(
|
| 228 |
-
status_code=500,
|
| 229 |
-
detail=f"Error during content ingestion: {str(e)}"
|
| 230 |
-
)
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
@router.post("/ingest-file")
|
| 234 |
-
async def ingest_from_file(file: UploadFile = File(...), title: str = Form(None), chapter: str = Form("Unknown")):
|
| 235 |
-
"""
|
| 236 |
-
Ingest content from an uploaded file
|
| 237 |
-
"""
|
| 238 |
-
try:
|
| 239 |
-
# Read the uploaded file
|
| 240 |
-
content = await file.read()
|
| 241 |
-
content_str = content.decode('utf-8')
|
| 242 |
-
|
| 243 |
-
# Use filename as title if not provided
|
| 244 |
-
if not title:
|
| 245 |
-
title = Path(file.filename).stem
|
| 246 |
-
|
| 247 |
-
# Ingest the content
|
| 248 |
-
return await ingest_from_content(
|
| 249 |
-
content=content_str,
|
| 250 |
-
title=title,
|
| 251 |
-
chapter=chapter,
|
| 252 |
-
section=Path(file.filename).stem
|
| 253 |
-
)
|
| 254 |
-
|
| 255 |
-
except UnicodeDecodeError:
|
| 256 |
-
raise HTTPException(
|
| 257 |
-
status_code=400,
|
| 258 |
-
detail="File must be a UTF-8 encoded text file"
|
| 259 |
-
)
|
| 260 |
-
except Exception as e:
|
| 261 |
-
raise HTTPException(
|
| 262 |
-
status_code=500,
|
| 263 |
-
detail=f"Error processing uploaded file: {str(e)}"
|
| 264 |
-
)
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
@router.get("/ingest-status/{job_id}")
|
| 268 |
-
async def get_ingest_status(job_id: str):
|
| 269 |
-
"""
|
| 270 |
-
Get the status of an ingestion job
|
| 271 |
-
"""
|
| 272 |
-
if job_id not in ingestion_jobs:
|
| 273 |
-
raise HTTPException(status_code=404, detail="Job not found")
|
| 274 |
-
|
| 275 |
-
return IngestStatusResponse(
|
| 276 |
-
status=ingestion_jobs[job_id]["status"],
|
| 277 |
-
progress=ingestion_jobs[job_id]["progress"],
|
| 278 |
-
message=ingestion_jobs[job_id]["message"],
|
| 279 |
-
details=ingestion_jobs[job_id]["details"]
|
| 280 |
-
)
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
@router.get("/ingest-stats")
|
| 284 |
-
async def get_ingest_stats():
|
| 285 |
-
"""
|
| 286 |
-
Get ingestion statistics
|
| 287 |
-
"""
|
| 288 |
-
# Get vector store stats
|
| 289 |
-
vector_stats = vector_repository.get_collection_stats()
|
| 290 |
-
|
| 291 |
-
# Get database stats
|
| 292 |
-
db_gen = get_db()
|
| 293 |
-
db = next(db_gen)
|
| 294 |
-
try:
|
| 295 |
-
content_repo = BookContentRepository(db)
|
| 296 |
-
all_docs = content_repo.get_all_documents()
|
| 297 |
-
total_docs = len(all_docs)
|
| 298 |
-
|
| 299 |
-
# Group by chapter
|
| 300 |
-
chapters = {}
|
| 301 |
-
for doc in all_docs:
|
| 302 |
-
chapter = doc.chapter
|
| 303 |
-
if chapter not in chapters:
|
| 304 |
-
chapters[chapter] = 0
|
| 305 |
-
chapters[chapter] += 1
|
| 306 |
-
finally:
|
| 307 |
-
next(db_gen, None)
|
| 308 |
-
|
| 309 |
-
return {
|
| 310 |
-
"vector_store": vector_stats,
|
| 311 |
-
"database": {
|
| 312 |
-
"total_documents": total_docs,
|
| 313 |
-
"documents_by_chapter": chapters
|
| 314 |
-
},
|
| 315 |
-
"timestamp": datetime.utcnow().isoformat()
|
| 316 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/config.py
DELETED
|
@@ -1,45 +0,0 @@
|
|
| 1 |
-
from pydantic_settings import BaseSettings
|
| 2 |
-
from typing import Optional
|
| 3 |
-
from functools import lru_cache
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class Settings(BaseSettings):
|
| 7 |
-
# OpenRouter API
|
| 8 |
-
OPENROUTER_API_KEY: str
|
| 9 |
-
OPENROUTER_BASE_URL: str = "https://openrouter.ai/api/v1"
|
| 10 |
-
|
| 11 |
-
# Qdrant Vector Database
|
| 12 |
-
QDRANT_URL: str
|
| 13 |
-
QDRANT_API_KEY: str
|
| 14 |
-
QDRANT_CLUSTER_ID: str
|
| 15 |
-
|
| 16 |
-
# Neon PostgreSQL Database
|
| 17 |
-
NEON_DATABASE_URL: str
|
| 18 |
-
|
| 19 |
-
# Cohere API (if needed)
|
| 20 |
-
COHERE_API_KEY: Optional[str] = None
|
| 21 |
-
|
| 22 |
-
# Google Gemini API
|
| 23 |
-
GEMINI_API_KEY: Optional[str] = None
|
| 24 |
-
|
| 25 |
-
# Backend API
|
| 26 |
-
BACKEND_API_KEY: str
|
| 27 |
-
|
| 28 |
-
# Application settings
|
| 29 |
-
DEBUG: bool = False
|
| 30 |
-
LOG_LEVEL: str = "INFO"
|
| 31 |
-
MAX_CONTENT_LENGTH: int = 5000
|
| 32 |
-
RATE_LIMIT_REQUESTS: int = 100
|
| 33 |
-
RATE_LIMIT_WINDOW: int = 60 # in seconds
|
| 34 |
-
|
| 35 |
-
class Config:
|
| 36 |
-
env_file = ".env"
|
| 37 |
-
case_sensitive = False
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
@lru_cache()
|
| 41 |
-
def get_settings():
|
| 42 |
-
return Settings()
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
settings = get_settings()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/database/__pycache__/database.cpython-312.pyc
DELETED
|
Binary file (1.65 kB)
|
|
|
app/database/__pycache__/models.cpython-312.pyc
DELETED
|
Binary file (5.82 kB)
|
|
|
app/database/__pycache__/repositories.cpython-312.pyc
DELETED
|
Binary file (10.8 kB)
|
|
|
app/database/database.py
DELETED
|
@@ -1,56 +0,0 @@
|
|
| 1 |
-
from sqlalchemy import create_engine
|
| 2 |
-
from sqlalchemy.ext.declarative import declarative_base
|
| 3 |
-
from sqlalchemy.orm import sessionmaker
|
| 4 |
-
from app.config import settings
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
# Create database engine with connection pooling
|
| 8 |
-
engine = create_engine(
|
| 9 |
-
settings.NEON_DATABASE_URL,
|
| 10 |
-
pool_pre_ping=True, # Verify connections before use
|
| 11 |
-
pool_recycle=300, # Recycle connections after 5 minutes
|
| 12 |
-
pool_size=10, # Number of connection to keep open
|
| 13 |
-
max_overflow=20, # Additional connections beyond pool_size
|
| 14 |
-
echo=False # Set to True for SQL query logging
|
| 15 |
-
)
|
| 16 |
-
|
| 17 |
-
# Create session factory
|
| 18 |
-
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
| 19 |
-
|
| 20 |
-
# Base class for models
|
| 21 |
-
Base = declarative_base()
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def get_db():
|
| 25 |
-
"""
|
| 26 |
-
Dependency for getting database session
|
| 27 |
-
"""
|
| 28 |
-
db = SessionLocal()
|
| 29 |
-
try:
|
| 30 |
-
yield db
|
| 31 |
-
finally:
|
| 32 |
-
db.close()
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
def create_tables():
|
| 36 |
-
"""
|
| 37 |
-
Create all database tables based on models
|
| 38 |
-
This should be called during application startup
|
| 39 |
-
"""
|
| 40 |
-
from .models import Base
|
| 41 |
-
Base.metadata.create_all(bind=engine)
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
def get_engine():
|
| 45 |
-
"""
|
| 46 |
-
Return the database engine (useful for direct access)
|
| 47 |
-
"""
|
| 48 |
-
return engine
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
# Global database instance for easy access
|
| 52 |
-
db_instance = {
|
| 53 |
-
'engine': engine,
|
| 54 |
-
'SessionLocal': SessionLocal,
|
| 55 |
-
'Base': Base
|
| 56 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/database/models.py
DELETED
|
@@ -1,138 +0,0 @@
|
|
| 1 |
-
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, JSON, Index
|
| 2 |
-
from sqlalchemy.ext.declarative import declarative_base
|
| 3 |
-
from sqlalchemy.orm import sessionmaker
|
| 4 |
-
from datetime import datetime
|
| 5 |
-
import uuid
|
| 6 |
-
from app.config import settings
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
Base = declarative_base()
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
class BookContentDocument(Base):
|
| 13 |
-
"""
|
| 14 |
-
Model for storing book content document metadata
|
| 15 |
-
"""
|
| 16 |
-
__tablename__ = "book_content_documents"
|
| 17 |
-
|
| 18 |
-
id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
|
| 19 |
-
document_id = Column(String, unique=True, nullable=False, index=True) # The ID from our chunking process
|
| 20 |
-
title = Column(String, nullable=False)
|
| 21 |
-
content = Column(Text, nullable=False) # Content summary or the actual content
|
| 22 |
-
chapter = Column(String, nullable=False)
|
| 23 |
-
section = Column(String, nullable=False)
|
| 24 |
-
page_reference = Column(String, nullable=True)
|
| 25 |
-
embedding_vector = Column(JSON, nullable=True) # Store as JSON for flexibility
|
| 26 |
-
token_count = Column(Integer, nullable=True)
|
| 27 |
-
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
| 28 |
-
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
|
| 29 |
-
|
| 30 |
-
# Indexes for better query performance
|
| 31 |
-
# Note: Indexes are handled separately to avoid creation conflicts
|
| 32 |
-
# __table_args__ = (
|
| 33 |
-
# Index('idx_chapter_section', 'chapter', 'section'),
|
| 34 |
-
# Index('idx_document_id', 'document_id'),
|
| 35 |
-
# )
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
class ChatSession(Base):
|
| 39 |
-
"""
|
| 40 |
-
Model for storing chat session information
|
| 41 |
-
"""
|
| 42 |
-
__tablename__ = "chat_sessions"
|
| 43 |
-
|
| 44 |
-
id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
|
| 45 |
-
session_id = Column(String, unique=True, nullable=False, index=True)
|
| 46 |
-
user_id = Column(String, nullable=True) # Optional user identifier
|
| 47 |
-
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
| 48 |
-
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
|
| 49 |
-
session_metadata = Column(JSON, nullable=True) # Additional session metadata
|
| 50 |
-
|
| 51 |
-
# Note: Indexes are handled separately to avoid creation conflicts
|
| 52 |
-
# __table_args__ = (
|
| 53 |
-
# Index('idx_session_id', 'session_id'),
|
| 54 |
-
# Index('idx_user_id', 'user_id'),
|
| 55 |
-
# )
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
class ChatMessage(Base):
|
| 59 |
-
"""
|
| 60 |
-
Model for storing individual chat messages
|
| 61 |
-
"""
|
| 62 |
-
__tablename__ = "chat_messages"
|
| 63 |
-
|
| 64 |
-
id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
|
| 65 |
-
message_id = Column(String, unique=True, nullable=False, index=True)
|
| 66 |
-
session_id = Column(String, nullable=False, index=True) # References chat_sessions.session_id
|
| 67 |
-
role = Column(String, nullable=False) # 'user' or 'assistant'
|
| 68 |
-
content = Column(Text, nullable=False)
|
| 69 |
-
citations = Column(JSON, nullable=True) # List of document IDs used in response
|
| 70 |
-
query_context_id = Column(String, nullable=True, index=True) # References query_context.context_id
|
| 71 |
-
timestamp = Column(DateTime, default=datetime.utcnow, nullable=False)
|
| 72 |
-
|
| 73 |
-
# Note: Indexes are handled separately to avoid creation conflicts
|
| 74 |
-
# __table_args__ = (
|
| 75 |
-
# Index('idx_session_id', 'session_id'),
|
| 76 |
-
# Index('idx_message_id', 'message_id'),
|
| 77 |
-
# Index('idx_query_context_id', 'query_context_id'),
|
| 78 |
-
# )
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
class QueryContext(Base):
|
| 82 |
-
"""
|
| 83 |
-
Model for storing query context information
|
| 84 |
-
"""
|
| 85 |
-
__tablename__ = "query_contexts"
|
| 86 |
-
|
| 87 |
-
id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
|
| 88 |
-
context_id = Column(String, unique=True, nullable=False, index=True)
|
| 89 |
-
session_id = Column(String, nullable=False, index=True) # References chat_sessions.session_id
|
| 90 |
-
selected_text = Column(Text, nullable=True) # Text selected by user (for selection-based queries)
|
| 91 |
-
query_type = Column(String, nullable=False) # 'global' or 'selection'
|
| 92 |
-
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
| 93 |
-
|
| 94 |
-
# Note: Indexes are handled separately to avoid creation conflicts
|
| 95 |
-
# __table_args__ = (
|
| 96 |
-
# Index('idx_context_id', 'context_id'),
|
| 97 |
-
# Index('idx_session_id', 'session_id'),
|
| 98 |
-
# )
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
# Create engine and session
|
| 102 |
-
engine = create_engine(settings.NEON_DATABASE_URL, pool_pre_ping=True)
|
| 103 |
-
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
def drop_tables():
|
| 107 |
-
"""
|
| 108 |
-
Drop all database tables
|
| 109 |
-
"""
|
| 110 |
-
# Drop all tables with CASCADE to handle foreign key dependencies
|
| 111 |
-
# First drop any dependent objects, then drop tables
|
| 112 |
-
from sqlalchemy import text
|
| 113 |
-
with engine.connect() as conn:
|
| 114 |
-
# Drop all tables in the schema with CASCADE
|
| 115 |
-
conn.execute(text("DROP SCHEMA public CASCADE"))
|
| 116 |
-
conn.execute(text("CREATE SCHEMA public"))
|
| 117 |
-
conn.execute(text("GRANT ALL ON SCHEMA public TO public"))
|
| 118 |
-
conn.execute(text("GRANT ALL ON SCHEMA public TO neondb_owner"))
|
| 119 |
-
conn.commit()
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
def create_tables():
|
| 123 |
-
"""
|
| 124 |
-
Create all database tables
|
| 125 |
-
"""
|
| 126 |
-
# Create tables with checkfirst to avoid errors if they already exist
|
| 127 |
-
Base.metadata.create_all(bind=engine, checkfirst=True)
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
def get_db():
|
| 131 |
-
"""
|
| 132 |
-
Dependency for getting database session
|
| 133 |
-
"""
|
| 134 |
-
db = SessionLocal()
|
| 135 |
-
try:
|
| 136 |
-
yield db
|
| 137 |
-
finally:
|
| 138 |
-
db.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/database/repositories.py
DELETED
|
@@ -1,217 +0,0 @@
|
|
| 1 |
-
from typing import List, Optional, Dict, Any
|
| 2 |
-
from sqlalchemy.orm import Session
|
| 3 |
-
from sqlalchemy import and_, or_
|
| 4 |
-
from app.database.models import BookContentDocument, ChatSession, ChatMessage, QueryContext
|
| 5 |
-
from app.ingestion.chunker import TextChunk
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class BookContentRepository:
|
| 9 |
-
"""
|
| 10 |
-
Repository for managing book content documents
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
def __init__(self, db: Session):
|
| 14 |
-
self.db = db
|
| 15 |
-
|
| 16 |
-
def create_document(self, chunk: Dict[str, Any]) -> BookContentDocument:
|
| 17 |
-
"""
|
| 18 |
-
Create a new book content document
|
| 19 |
-
"""
|
| 20 |
-
db_document = BookContentDocument(
|
| 21 |
-
document_id=chunk.get('id'),
|
| 22 |
-
title=chunk.get('title', ''),
|
| 23 |
-
content=chunk.get('content', ''),
|
| 24 |
-
chapter=chunk.get('chapter', ''),
|
| 25 |
-
section=chunk.get('section', ''),
|
| 26 |
-
page_reference=chunk.get('page_reference', ''),
|
| 27 |
-
embedding_vector=chunk.get('embedding'),
|
| 28 |
-
token_count=chunk.get('token_count', 0)
|
| 29 |
-
)
|
| 30 |
-
self.db.add(db_document)
|
| 31 |
-
self.db.commit()
|
| 32 |
-
self.db.refresh(db_document)
|
| 33 |
-
return db_document
|
| 34 |
-
|
| 35 |
-
def get_document_by_id(self, document_id: str) -> Optional[BookContentDocument]:
|
| 36 |
-
"""
|
| 37 |
-
Get a document by its ID
|
| 38 |
-
"""
|
| 39 |
-
return self.db.query(BookContentDocument).filter(
|
| 40 |
-
BookContentDocument.document_id == document_id
|
| 41 |
-
).first()
|
| 42 |
-
|
| 43 |
-
def get_documents_by_chapter(self, chapter: str) -> List[BookContentDocument]:
|
| 44 |
-
"""
|
| 45 |
-
Get all documents for a specific chapter
|
| 46 |
-
"""
|
| 47 |
-
return self.db.query(BookContentDocument).filter(
|
| 48 |
-
BookContentDocument.chapter == chapter
|
| 49 |
-
).all()
|
| 50 |
-
|
| 51 |
-
def get_all_documents(self) -> List[BookContentDocument]:
|
| 52 |
-
"""
|
| 53 |
-
Get all documents
|
| 54 |
-
"""
|
| 55 |
-
return self.db.query(BookContentDocument).all()
|
| 56 |
-
|
| 57 |
-
def update_document(self, document_id: str, **kwargs) -> Optional[BookContentDocument]:
|
| 58 |
-
"""
|
| 59 |
-
Update a document
|
| 60 |
-
"""
|
| 61 |
-
document = self.get_document_by_id(document_id)
|
| 62 |
-
if document:
|
| 63 |
-
for key, value in kwargs.items():
|
| 64 |
-
setattr(document, key, value)
|
| 65 |
-
self.db.commit()
|
| 66 |
-
self.db.refresh(document)
|
| 67 |
-
return document
|
| 68 |
-
|
| 69 |
-
def delete_document(self, document_id: str) -> bool:
|
| 70 |
-
"""
|
| 71 |
-
Delete a document
|
| 72 |
-
"""
|
| 73 |
-
document = self.get_document_by_id(document_id)
|
| 74 |
-
if document:
|
| 75 |
-
self.db.delete(document)
|
| 76 |
-
self.db.commit()
|
| 77 |
-
return True
|
| 78 |
-
return False
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
class ChatSessionRepository:
|
| 82 |
-
"""
|
| 83 |
-
Repository for managing chat sessions
|
| 84 |
-
"""
|
| 85 |
-
|
| 86 |
-
def __init__(self, db: Session):
|
| 87 |
-
self.db = db
|
| 88 |
-
|
| 89 |
-
def create_session(self, session_id: str, user_id: Optional[str] = None, metadata: Optional[Dict] = None) -> ChatSession:
|
| 90 |
-
"""
|
| 91 |
-
Create a new chat session
|
| 92 |
-
"""
|
| 93 |
-
db_session = ChatSession(
|
| 94 |
-
session_id=session_id,
|
| 95 |
-
user_id=user_id,
|
| 96 |
-
session_metadata=metadata
|
| 97 |
-
)
|
| 98 |
-
self.db.add(db_session)
|
| 99 |
-
self.db.commit()
|
| 100 |
-
self.db.refresh(db_session)
|
| 101 |
-
return db_session
|
| 102 |
-
|
| 103 |
-
def get_session_by_id(self, session_id: str) -> Optional[ChatSession]:
|
| 104 |
-
"""
|
| 105 |
-
Get a session by its ID
|
| 106 |
-
"""
|
| 107 |
-
return self.db.query(ChatSession).filter(
|
| 108 |
-
ChatSession.session_id == session_id
|
| 109 |
-
).first()
|
| 110 |
-
|
| 111 |
-
def update_session(self, session_id: str, **kwargs) -> Optional[ChatSession]:
|
| 112 |
-
"""
|
| 113 |
-
Update a session
|
| 114 |
-
"""
|
| 115 |
-
session = self.get_session_by_id(session_id)
|
| 116 |
-
if session:
|
| 117 |
-
for key, value in kwargs.items():
|
| 118 |
-
setattr(session, key, value)
|
| 119 |
-
self.db.commit()
|
| 120 |
-
self.db.refresh(session)
|
| 121 |
-
return session
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
class ChatMessageRepository:
|
| 125 |
-
"""
|
| 126 |
-
Repository for managing chat messages
|
| 127 |
-
"""
|
| 128 |
-
|
| 129 |
-
def __init__(self, db: Session):
|
| 130 |
-
self.db = db
|
| 131 |
-
|
| 132 |
-
def create_message(
|
| 133 |
-
self,
|
| 134 |
-
message_id: str,
|
| 135 |
-
session_id: str,
|
| 136 |
-
role: str,
|
| 137 |
-
content: str,
|
| 138 |
-
citations: Optional[List[Dict]] = None,
|
| 139 |
-
query_context_id: Optional[str] = None
|
| 140 |
-
) -> ChatMessage:
|
| 141 |
-
"""
|
| 142 |
-
Create a new chat message
|
| 143 |
-
"""
|
| 144 |
-
db_message = ChatMessage(
|
| 145 |
-
message_id=message_id,
|
| 146 |
-
session_id=session_id,
|
| 147 |
-
role=role,
|
| 148 |
-
content=content,
|
| 149 |
-
citations=citations,
|
| 150 |
-
query_context_id=query_context_id
|
| 151 |
-
)
|
| 152 |
-
self.db.add(db_message)
|
| 153 |
-
self.db.commit()
|
| 154 |
-
self.db.refresh(db_message)
|
| 155 |
-
return db_message
|
| 156 |
-
|
| 157 |
-
def get_messages_by_session(self, session_id: str) -> List[ChatMessage]:
|
| 158 |
-
"""
|
| 159 |
-
Get all messages for a session
|
| 160 |
-
"""
|
| 161 |
-
return self.db.query(ChatMessage).filter(
|
| 162 |
-
ChatMessage.session_id == session_id
|
| 163 |
-
).order_by(ChatMessage.timestamp).all()
|
| 164 |
-
|
| 165 |
-
def get_message_by_id(self, message_id: str) -> Optional[ChatMessage]:
|
| 166 |
-
"""
|
| 167 |
-
Get a message by its ID
|
| 168 |
-
"""
|
| 169 |
-
return self.db.query(ChatMessage).filter(
|
| 170 |
-
ChatMessage.message_id == message_id
|
| 171 |
-
).first()
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
class QueryContextRepository:
|
| 175 |
-
"""
|
| 176 |
-
Repository for managing query contexts
|
| 177 |
-
"""
|
| 178 |
-
|
| 179 |
-
def __init__(self, db: Session):
|
| 180 |
-
self.db = db
|
| 181 |
-
|
| 182 |
-
def create_query_context(
|
| 183 |
-
self,
|
| 184 |
-
context_id: str,
|
| 185 |
-
session_id: str,
|
| 186 |
-
selected_text: Optional[str] = None,
|
| 187 |
-
query_type: str = "global"
|
| 188 |
-
) -> QueryContext:
|
| 189 |
-
"""
|
| 190 |
-
Create a new query context
|
| 191 |
-
"""
|
| 192 |
-
db_context = QueryContext(
|
| 193 |
-
context_id=context_id,
|
| 194 |
-
session_id=session_id,
|
| 195 |
-
selected_text=selected_text,
|
| 196 |
-
query_type=query_type
|
| 197 |
-
)
|
| 198 |
-
self.db.add(db_context)
|
| 199 |
-
self.db.commit()
|
| 200 |
-
self.db.refresh(db_context)
|
| 201 |
-
return db_context
|
| 202 |
-
|
| 203 |
-
def get_context_by_id(self, context_id: str) -> Optional[QueryContext]:
|
| 204 |
-
"""
|
| 205 |
-
Get a query context by its ID
|
| 206 |
-
"""
|
| 207 |
-
return self.db.query(QueryContext).filter(
|
| 208 |
-
QueryContext.context_id == context_id
|
| 209 |
-
).first()
|
| 210 |
-
|
| 211 |
-
def get_contexts_by_session(self, session_id: str) -> List[QueryContext]:
|
| 212 |
-
"""
|
| 213 |
-
Get all query contexts for a session
|
| 214 |
-
"""
|
| 215 |
-
return self.db.query(QueryContext).filter(
|
| 216 |
-
QueryContext.session_id == session_id
|
| 217 |
-
).all()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/embeddings/__pycache__/minimal_embedding_generator.cpython-312.pyc
DELETED
|
Binary file (3.65 kB)
|
|
|
app/embeddings/minimal_embedding_generator.py
DELETED
|
@@ -1,79 +0,0 @@
|
|
| 1 |
-
import numpy as np
|
| 2 |
-
from typing import List
|
| 3 |
-
import hashlib
|
| 4 |
-
import re
|
| 5 |
-
from collections import Counter
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class MinimalEmbeddingGenerator:
|
| 9 |
-
"""
|
| 10 |
-
Minimal embedding generator using simple hashing and basic statistics
|
| 11 |
-
This avoids heavy dependencies and memory issues
|
| 12 |
-
"""
|
| 13 |
-
|
| 14 |
-
def __init__(self):
|
| 15 |
-
self.vocab = set()
|
| 16 |
-
self.fitted = False
|
| 17 |
-
|
| 18 |
-
def _pad_embedding(self, embedding: List[float], target_size: int = 1536) -> List[float]:
|
| 19 |
-
"""
|
| 20 |
-
Pad embedding to target size with zeros
|
| 21 |
-
"""
|
| 22 |
-
current_size = len(embedding)
|
| 23 |
-
if current_size >= target_size:
|
| 24 |
-
return embedding[:target_size]
|
| 25 |
-
else:
|
| 26 |
-
padded = [0.0] * target_size
|
| 27 |
-
padded[:current_size] = embedding
|
| 28 |
-
return padded
|
| 29 |
-
|
| 30 |
-
def _text_to_vector(self, text: str) -> List[float]:
|
| 31 |
-
"""
|
| 32 |
-
Convert text to a vector using simple hashing approach
|
| 33 |
-
"""
|
| 34 |
-
# Clean text
|
| 35 |
-
text = re.sub(r'[^\w\s]', ' ', text.lower())
|
| 36 |
-
words = text.split()
|
| 37 |
-
|
| 38 |
-
if not words:
|
| 39 |
-
return [0.0] * 1536
|
| 40 |
-
|
| 41 |
-
# Create a simple vector based on word hashes
|
| 42 |
-
vector = [0.0] * 1536
|
| 43 |
-
|
| 44 |
-
for word in words:
|
| 45 |
-
# Use hash to determine position in vector
|
| 46 |
-
hash_val = int(hashlib.md5(word.encode()).hexdigest(), 16)
|
| 47 |
-
pos = hash_val % 1536
|
| 48 |
-
# Add to vector with some normalization
|
| 49 |
-
vector[pos] += 1.0 / len(words) # Normalize by document length
|
| 50 |
-
|
| 51 |
-
# Normalize the vector
|
| 52 |
-
norm = sum(x**2 for x in vector) ** 0.5
|
| 53 |
-
if norm > 0:
|
| 54 |
-
vector = [x / norm for x in vector]
|
| 55 |
-
|
| 56 |
-
return vector
|
| 57 |
-
|
| 58 |
-
def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
|
| 59 |
-
"""
|
| 60 |
-
Generate embeddings for a list of texts using simple hashing
|
| 61 |
-
"""
|
| 62 |
-
embeddings = []
|
| 63 |
-
for text in texts:
|
| 64 |
-
embedding = self._text_to_vector(text)
|
| 65 |
-
padded_embedding = self._pad_embedding(embedding)
|
| 66 |
-
embeddings.append(padded_embedding)
|
| 67 |
-
|
| 68 |
-
return embeddings
|
| 69 |
-
|
| 70 |
-
def encode_query(self, query: str) -> List[float]:
|
| 71 |
-
"""
|
| 72 |
-
Encode a single query for similarity search
|
| 73 |
-
"""
|
| 74 |
-
embedding = self._text_to_vector(query)
|
| 75 |
-
return self._pad_embedding(embedding)
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
# Global instance
|
| 79 |
-
minimal_embedding_generator = MinimalEmbeddingGenerator()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/generation/__pycache__/response_generator.cpython-312.pyc
DELETED
|
Binary file (14.7 kB)
|
|
|
app/generation/response_generator.py
DELETED
|
@@ -1,387 +0,0 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
-
import json
|
| 3 |
-
from typing import List, Dict, Any, Optional
|
| 4 |
-
from app.services.openrouter_client import openrouter_client
|
| 5 |
-
from app.prompting.prompt_builder import PromptBuilder
|
| 6 |
-
from app.retrieval.retriever import Retriever
|
| 7 |
-
from app.config import settings
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
class ResponseGenerator:
|
| 11 |
-
"""
|
| 12 |
-
Generates responses using Google Gemini with proper context and citation tracking
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
def __init__(self):
|
| 16 |
-
self.openrouter_client = openrouter_client
|
| 17 |
-
self.prompt_builder = PromptBuilder()
|
| 18 |
-
self.retriever = Retriever()
|
| 19 |
-
|
| 20 |
-
async def generate_response(
|
| 21 |
-
self,
|
| 22 |
-
query: str,
|
| 23 |
-
retrieved_contexts: List[Dict[str, Any]],
|
| 24 |
-
query_type: str = "global",
|
| 25 |
-
selected_text: Optional[str] = None,
|
| 26 |
-
session_id: Optional[str] = None
|
| 27 |
-
) -> Dict[str, Any]:
|
| 28 |
-
"""
|
| 29 |
-
Generate a response to a query using Google Gemini
|
| 30 |
-
"""
|
| 31 |
-
try:
|
| 32 |
-
# Build context string from retrieved contexts
|
| 33 |
-
context_parts = []
|
| 34 |
-
for ctx in retrieved_contexts:
|
| 35 |
-
context_text = ctx.get('content', '')
|
| 36 |
-
if context_text:
|
| 37 |
-
context_parts.append(context_text)
|
| 38 |
-
|
| 39 |
-
context_string = "\n\n".join(context_parts)
|
| 40 |
-
|
| 41 |
-
# Prepare messages for OpenRouter
|
| 42 |
-
messages = [
|
| 43 |
-
{
|
| 44 |
-
"role": "system",
|
| 45 |
-
"content": "You are an expert assistant for the Physical AI & Humanoid Robotics curriculum. Provide helpful, conversational responses based on the provided context. Always use information only from the provided context and be factual."
|
| 46 |
-
},
|
| 47 |
-
{
|
| 48 |
-
"role": "user",
|
| 49 |
-
"content": f"Context: {context_string}\n\nQuestion: {query}\n\nProvide a helpful response based on the context:"
|
| 50 |
-
}
|
| 51 |
-
]
|
| 52 |
-
|
| 53 |
-
# Generate response using OpenRouter
|
| 54 |
-
openrouter_response = await self.openrouter_client.generate_completion(
|
| 55 |
-
messages=messages,
|
| 56 |
-
model="mistralai/devstral-2512:free",
|
| 57 |
-
temperature=0.3,
|
| 58 |
-
max_tokens=1000
|
| 59 |
-
)
|
| 60 |
-
|
| 61 |
-
if not openrouter_response:
|
| 62 |
-
return {
|
| 63 |
-
"response": "I encountered an issue generating a response. Please try again.",
|
| 64 |
-
"citations": [],
|
| 65 |
-
"query_type": query_type,
|
| 66 |
-
"session_id": session_id
|
| 67 |
-
}
|
| 68 |
-
|
| 69 |
-
# Extract citations from contexts used
|
| 70 |
-
citations = self._extract_citations(retrieved_contexts)
|
| 71 |
-
|
| 72 |
-
return {
|
| 73 |
-
"response": openrouter_response,
|
| 74 |
-
"citations": citations,
|
| 75 |
-
"query_type": query_type,
|
| 76 |
-
"session_id": session_id
|
| 77 |
-
}
|
| 78 |
-
|
| 79 |
-
except Exception as e:
|
| 80 |
-
print(f"[ERROR] Error generating response: {str(e)}")
|
| 81 |
-
# If AI service fails, try to provide a helpful response based on the context
|
| 82 |
-
if retrieved_contexts:
|
| 83 |
-
# Extract key information from contexts to provide a basic response
|
| 84 |
-
context_titles = [ctx.get('title', '') for ctx in retrieved_contexts if ctx.get('title')]
|
| 85 |
-
unique_titles = list(set(context_titles))
|
| 86 |
-
|
| 87 |
-
if unique_titles:
|
| 88 |
-
response = f"Based on the Physical AI & Humanoid Robotics curriculum, I found information related to: {', '.join(unique_titles[:3])}. "
|
| 89 |
-
response += "Here's what I can share from the book content: "
|
| 90 |
-
# Include some of the actual content from the contexts
|
| 91 |
-
first_context = retrieved_contexts[0]
|
| 92 |
-
content_preview = first_context.get('content', '')[:200]
|
| 93 |
-
response += content_preview + ("..." if len(first_context.get('content', '')) > 200 else "")
|
| 94 |
-
else:
|
| 95 |
-
response = "Based on the Physical AI & Humanoid Robotics curriculum, I found relevant content. "
|
| 96 |
-
response += "Could you ask a more specific question about the topic you're interested in?"
|
| 97 |
-
else:
|
| 98 |
-
response = "I couldn't find relevant information in the Physical AI & Humanoid Robotics curriculum to answer your question. "
|
| 99 |
-
response += "Please try asking about specific topics from the curriculum like ROS 2, Digital Twins, AI-Brain, or VLA."
|
| 100 |
-
|
| 101 |
-
return {
|
| 102 |
-
"response": response,
|
| 103 |
-
"citations": self._extract_citations(retrieved_contexts),
|
| 104 |
-
"query_type": query_type,
|
| 105 |
-
"session_id": session_id,
|
| 106 |
-
"error": str(e)
|
| 107 |
-
}
|
| 108 |
-
|
| 109 |
-
async def generate_response_with_validation(
|
| 110 |
-
self,
|
| 111 |
-
query: str,
|
| 112 |
-
retrieved_contexts: List[Dict[str, Any]],
|
| 113 |
-
query_type: str = "global",
|
| 114 |
-
selected_text: Optional[str] = None,
|
| 115 |
-
session_id: Optional[str] = None
|
| 116 |
-
) -> Dict[str, Any]:
|
| 117 |
-
"""
|
| 118 |
-
Generate response with additional validation to ensure it's based only on provided context
|
| 119 |
-
"""
|
| 120 |
-
try:
|
| 121 |
-
# First, generate the initial response
|
| 122 |
-
result = await self.generate_response(
|
| 123 |
-
query=query,
|
| 124 |
-
retrieved_contexts=retrieved_contexts,
|
| 125 |
-
query_type=query_type,
|
| 126 |
-
selected_text=selected_text,
|
| 127 |
-
session_id=session_id
|
| 128 |
-
)
|
| 129 |
-
|
| 130 |
-
# Validate that the response is grounded in the provided context
|
| 131 |
-
# But be more lenient for high-level book/module questions
|
| 132 |
-
if retrieved_contexts and "The provided context does not contain" not in result.get("response", ""):
|
| 133 |
-
is_valid = self._validate_response_uses_context(
|
| 134 |
-
response=result["response"],
|
| 135 |
-
contexts=retrieved_contexts
|
| 136 |
-
)
|
| 137 |
-
|
| 138 |
-
# For high-level book/module questions, be more lenient with validation
|
| 139 |
-
query_lower = query.lower()
|
| 140 |
-
is_book_overview_query = any(phrase in query_lower for phrase in [
|
| 141 |
-
'what is', 'tell me about', 'describe', 'overview', 'introduction',
|
| 142 |
-
'physical ai', 'humanoid robotics', 'book', 'curriculum', 'module',
|
| 143 |
-
'quick start', 'setup', 'getting started', 'chapter', 'section'
|
| 144 |
-
])
|
| 145 |
-
|
| 146 |
-
# Only retry with stronger guidance if it's not a book overview query and validation fails
|
| 147 |
-
if not is_valid and not is_book_overview_query and query_type != "selection":
|
| 148 |
-
# If response doesn't seem to use context, try again with stronger instructions
|
| 149 |
-
result = await self._generate_with_stronger_context_guidance(
|
| 150 |
-
query=query,
|
| 151 |
-
retrieved_contexts=retrieved_contexts,
|
| 152 |
-
query_type=query_type,
|
| 153 |
-
selected_text=selected_text,
|
| 154 |
-
session_id=session_id
|
| 155 |
-
)
|
| 156 |
-
|
| 157 |
-
return result
|
| 158 |
-
except Exception as e:
|
| 159 |
-
print(f"[ERROR] Error in generate_response_with_validation: {str(e)}")
|
| 160 |
-
# If AI service fails, try to provide a helpful response based on the context
|
| 161 |
-
if retrieved_contexts:
|
| 162 |
-
# Extract key information from contexts to provide a basic response
|
| 163 |
-
context_titles = [ctx.get('title', '') for ctx in retrieved_contexts if ctx.get('title')]
|
| 164 |
-
unique_titles = list(set(context_titles))
|
| 165 |
-
|
| 166 |
-
if unique_titles:
|
| 167 |
-
response = f"Based on the Physical AI & Humanoid Robotics curriculum, I found information related to: {', '.join(unique_titles[:3])}. "
|
| 168 |
-
response += "Here's what I can share from the book content: "
|
| 169 |
-
# Include some of the actual content from the contexts
|
| 170 |
-
first_context = retrieved_contexts[0]
|
| 171 |
-
content_preview = first_context.get('content', '')[:200]
|
| 172 |
-
response += content_preview + ("..." if len(first_context.get('content', '')) > 200 else "")
|
| 173 |
-
else:
|
| 174 |
-
response = "Based on the Physical AI & Humanoid Robotics curriculum, I found relevant content. "
|
| 175 |
-
response += "Could you ask a more specific question about the topic you're interested in?"
|
| 176 |
-
else:
|
| 177 |
-
response = "I couldn't find relevant information in the Physical AI & Humanoid Robotics curriculum to answer your question. "
|
| 178 |
-
response += "Please try asking about specific topics from the curriculum like ROS 2, Digital Twins, AI-Brain, or VLA."
|
| 179 |
-
|
| 180 |
-
return {
|
| 181 |
-
"response": response,
|
| 182 |
-
"citations": self._extract_citations(retrieved_contexts),
|
| 183 |
-
"query_type": query_type,
|
| 184 |
-
"session_id": session_id,
|
| 185 |
-
"error": str(e)
|
| 186 |
-
}
|
| 187 |
-
|
| 188 |
-
async def _generate_with_stronger_context_guidance(
|
| 189 |
-
self,
|
| 190 |
-
query: str,
|
| 191 |
-
retrieved_contexts: List[Dict[str, Any]],
|
| 192 |
-
query_type: str,
|
| 193 |
-
selected_text: Optional[str] = None,
|
| 194 |
-
session_id: Optional[str] = None
|
| 195 |
-
) -> Dict[str, Any]:
|
| 196 |
-
"""
|
| 197 |
-
Generate response with stronger instructions to use only provided context
|
| 198 |
-
"""
|
| 199 |
-
# Build context string from retrieved contexts
|
| 200 |
-
context_parts = []
|
| 201 |
-
for ctx in retrieved_contexts:
|
| 202 |
-
context_text = ctx.get('content', '')
|
| 203 |
-
if context_text:
|
| 204 |
-
context_parts.append(context_text)
|
| 205 |
-
|
| 206 |
-
context_string = "\n\n".join(context_parts)
|
| 207 |
-
|
| 208 |
-
# Add stronger instructions to the context
|
| 209 |
-
stronger_context = (
|
| 210 |
-
context_string +
|
| 211 |
-
"\n\nIMPORTANT: The response MUST be based ONLY on the provided context above. "
|
| 212 |
-
"Do not use any external knowledge or make up information. "
|
| 213 |
-
"If the context does not contain the answer, explicitly state this fact."
|
| 214 |
-
)
|
| 215 |
-
|
| 216 |
-
# Prepare messages for OpenRouter
|
| 217 |
-
messages = [
|
| 218 |
-
{
|
| 219 |
-
"role": "system",
|
| 220 |
-
"content": "You are an expert assistant for the Physical AI & Humanoid Robotics curriculum. Provide helpful, conversational responses based on the provided context. Always use information only from the provided context and be factual."
|
| 221 |
-
},
|
| 222 |
-
{
|
| 223 |
-
"role": "user",
|
| 224 |
-
"content": f"Context: {stronger_context}\n\nQuestion: {query}\n\nProvide a helpful response based on the context:"
|
| 225 |
-
}
|
| 226 |
-
]
|
| 227 |
-
|
| 228 |
-
# Generate response using OpenRouter
|
| 229 |
-
openrouter_response = await self.openrouter_client.generate_completion(
|
| 230 |
-
messages=messages,
|
| 231 |
-
model="mistralai/devstral-2512:free",
|
| 232 |
-
temperature=0.3,
|
| 233 |
-
max_tokens=1000
|
| 234 |
-
)
|
| 235 |
-
|
| 236 |
-
if not openrouter_response:
|
| 237 |
-
return {
|
| 238 |
-
"response": "I encountered an issue generating a response. Please try again.",
|
| 239 |
-
"citations": [],
|
| 240 |
-
"query_type": query_type,
|
| 241 |
-
"session_id": session_id
|
| 242 |
-
}
|
| 243 |
-
|
| 244 |
-
citations = self._extract_citations(retrieved_contexts)
|
| 245 |
-
|
| 246 |
-
return {
|
| 247 |
-
"response": openrouter_response,
|
| 248 |
-
"citations": citations,
|
| 249 |
-
"query_type": query_type,
|
| 250 |
-
"session_id": session_id
|
| 251 |
-
}
|
| 252 |
-
|
| 253 |
-
def _extract_citations(self, contexts: List[Dict[str, Any]]) -> List[Dict[str, str]]:
|
| 254 |
-
"""
|
| 255 |
-
Extract citation information from retrieved contexts
|
| 256 |
-
"""
|
| 257 |
-
citations = []
|
| 258 |
-
for ctx in contexts:
|
| 259 |
-
citation = {
|
| 260 |
-
"document_id": ctx.get('id', ''),
|
| 261 |
-
"title": ctx.get('title', ''),
|
| 262 |
-
"chapter": ctx.get('chapter', ''),
|
| 263 |
-
"section": ctx.get('section', ''),
|
| 264 |
-
"page_reference": ctx.get('page_reference', '')
|
| 265 |
-
}
|
| 266 |
-
citations.append(citation)
|
| 267 |
-
return citations
|
| 268 |
-
|
| 269 |
-
def _validate_response_uses_context(
|
| 270 |
-
self,
|
| 271 |
-
response: str,
|
| 272 |
-
contexts: List[Dict[str, Any]]
|
| 273 |
-
) -> bool:
|
| 274 |
-
"""
|
| 275 |
-
Validate if response uses the provided context, allowing for summarization and synthesis
|
| 276 |
-
"""
|
| 277 |
-
if not contexts:
|
| 278 |
-
return False
|
| 279 |
-
|
| 280 |
-
response_lower = response.lower()
|
| 281 |
-
|
| 282 |
-
# Check for semantic relevance rather than exact keyword matching
|
| 283 |
-
# Look for broader topic matches and concepts
|
| 284 |
-
context_text = " ".join([ctx.get('content', '') for ctx in contexts if ctx.get('content')])
|
| 285 |
-
context_lower = context_text.lower()
|
| 286 |
-
|
| 287 |
-
# Check for relevant topics and concepts that indicate proper grounding
|
| 288 |
-
book_related_terms = [
|
| 289 |
-
'physical ai', 'humanoid', 'robotics', 'module', 'ros', 'digital twin',
|
| 290 |
-
'ai-brain', 'vla', 'curriculum', 'course', 'book', 'chapter', 'section',
|
| 291 |
-
'setup', 'quickstart', 'introduction', 'learning', 'education'
|
| 292 |
-
]
|
| 293 |
-
|
| 294 |
-
# Check if response mentions book-related concepts that are in context
|
| 295 |
-
response_has_relevant_terms = any(term in response_lower for term in book_related_terms)
|
| 296 |
-
context_has_relevant_terms = any(term in context_lower for term in book_related_terms)
|
| 297 |
-
|
| 298 |
-
if response_has_relevant_terms and context_has_relevant_terms:
|
| 299 |
-
return True
|
| 300 |
-
|
| 301 |
-
# Also check for title and chapter references
|
| 302 |
-
context_titles = " ".join([ctx.get('title', '') + ' ' + ctx.get('chapter', '') + ' ' + ctx.get('section', '')
|
| 303 |
-
for ctx in contexts if ctx.get('title') or ctx.get('chapter') or ctx.get('section')])
|
| 304 |
-
context_titles_lower = context_titles.lower()
|
| 305 |
-
|
| 306 |
-
# If response mentions specific titles/chapters that exist in context, it's likely valid
|
| 307 |
-
if any(title_term in response_lower for title_term in context_titles_lower.split() if len(title_term) > 3):
|
| 308 |
-
return True
|
| 309 |
-
|
| 310 |
-
# Fallback: check if there's general overlap in concepts
|
| 311 |
-
response_words = set(response_lower.split())
|
| 312 |
-
context_words = set(context_lower.split())
|
| 313 |
-
|
| 314 |
-
# Find intersection of meaningful words (longer than 3 chars, not common words)
|
| 315 |
-
common_words = response_words.intersection(context_words)
|
| 316 |
-
meaningful_common_words = [w for w in common_words if len(w) > 3 and w not in ['the', 'and', 'for', 'are', 'but', 'not', 'you', 'have', 'with', 'this', 'that', 'from']]
|
| 317 |
-
|
| 318 |
-
# If there are meaningful overlapping words, consider it valid
|
| 319 |
-
return len(meaningful_common_words) >= 2
|
| 320 |
-
|
| 321 |
-
async def generate_citation_aware_response(
|
| 322 |
-
self,
|
| 323 |
-
query: str,
|
| 324 |
-
retrieved_contexts: List[Dict[str, Any]],
|
| 325 |
-
query_type: str = "global",
|
| 326 |
-
selected_text: Optional[str] = None,
|
| 327 |
-
session_id: Optional[str] = None
|
| 328 |
-
) -> Dict[str, Any]:
|
| 329 |
-
"""
|
| 330 |
-
Generate a response that explicitly mentions which sources were used
|
| 331 |
-
"""
|
| 332 |
-
# Build context string from retrieved contexts
|
| 333 |
-
context_parts = []
|
| 334 |
-
for ctx in retrieved_contexts:
|
| 335 |
-
context_text = ctx.get('content', '')
|
| 336 |
-
if context_text:
|
| 337 |
-
context_parts.append(context_text)
|
| 338 |
-
|
| 339 |
-
context_string = "\n\n".join(context_parts)
|
| 340 |
-
|
| 341 |
-
# Add specific instruction about citations to the context
|
| 342 |
-
citation_context = (
|
| 343 |
-
context_string +
|
| 344 |
-
"\n\nWhen answering, please indicate which sources you used by referencing them as "
|
| 345 |
-
"[Source 1], [Source 2], etc., corresponding to the order they appear in the context section."
|
| 346 |
-
)
|
| 347 |
-
|
| 348 |
-
# Prepare messages for OpenRouter
|
| 349 |
-
messages = [
|
| 350 |
-
{
|
| 351 |
-
"role": "system",
|
| 352 |
-
"content": "You are an expert assistant for the Physical AI & Humanoid Robotics curriculum. Provide helpful, conversational responses based on the provided context. Always use information only from the provided context and be factual. When answering, please indicate which sources you used by referencing them as [Source 1], [Source 2], etc., corresponding to the order they appear in the context section."
|
| 353 |
-
},
|
| 354 |
-
{
|
| 355 |
-
"role": "user",
|
| 356 |
-
"content": f"Context: {citation_context}\n\nQuestion: {query}\n\nProvide a helpful response based on the context:"
|
| 357 |
-
}
|
| 358 |
-
]
|
| 359 |
-
|
| 360 |
-
# Generate response using OpenRouter
|
| 361 |
-
openrouter_response = await self.openrouter_client.generate_completion(
|
| 362 |
-
messages=messages,
|
| 363 |
-
model="mistralai/devstral-2512:free",
|
| 364 |
-
temperature=0.3,
|
| 365 |
-
max_tokens=1000
|
| 366 |
-
)
|
| 367 |
-
|
| 368 |
-
if not openrouter_response:
|
| 369 |
-
return {
|
| 370 |
-
"response": "I encountered an issue generating a response. Please try again.",
|
| 371 |
-
"citations": [],
|
| 372 |
-
"query_type": query_type,
|
| 373 |
-
"session_id": session_id
|
| 374 |
-
}
|
| 375 |
-
|
| 376 |
-
citations = self._extract_citations(retrieved_contexts)
|
| 377 |
-
|
| 378 |
-
return {
|
| 379 |
-
"response": openrouter_response,
|
| 380 |
-
"citations": citations,
|
| 381 |
-
"query_type": query_type,
|
| 382 |
-
"session_id": session_id
|
| 383 |
-
}
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
# Global instance
|
| 387 |
-
response_generator = ResponseGenerator()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/ingestion/__pycache__/chunker.cpython-312.pyc
DELETED
|
Binary file (10.1 kB)
|
|
|
app/ingestion/__pycache__/document_parser.cpython-312.pyc
DELETED
|
Binary file (6.08 kB)
|
|
|
app/ingestion/__pycache__/file_scanner.cpython-312.pyc
DELETED
|
Binary file (4.87 kB)
|
|
|
app/ingestion/chunker.py
DELETED
|
@@ -1,291 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
import uuid
|
| 3 |
-
from typing import List, Dict, Tuple
|
| 4 |
-
from dataclasses import dataclass
|
| 5 |
-
from .document_parser import DocumentParser
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
@dataclass
|
| 9 |
-
class TextChunk:
|
| 10 |
-
"""Represents a chunk of text with metadata"""
|
| 11 |
-
id: str
|
| 12 |
-
content: str
|
| 13 |
-
title: str
|
| 14 |
-
chapter: str
|
| 15 |
-
section: str
|
| 16 |
-
page_reference: str
|
| 17 |
-
token_count: int
|
| 18 |
-
original_start_pos: int
|
| 19 |
-
original_end_pos: int
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
class TextChunker:
|
| 23 |
-
"""
|
| 24 |
-
Implements heading-aware text chunking to maintain semantic boundaries
|
| 25 |
-
"""
|
| 26 |
-
|
| 27 |
-
def __init__(self, max_tokens: int = 800, min_tokens: int = 100, overlap_ratio: float = 0.1):
|
| 28 |
-
self.max_tokens = max_tokens
|
| 29 |
-
self.min_tokens = min_tokens
|
| 30 |
-
self.overlap_ratio = overlap_ratio
|
| 31 |
-
self.parser = DocumentParser()
|
| 32 |
-
|
| 33 |
-
def chunk_document(self, document: Dict) -> List[TextChunk]:
|
| 34 |
-
"""
|
| 35 |
-
Chunk a document while preserving semantic boundaries
|
| 36 |
-
"""
|
| 37 |
-
content = document['content']
|
| 38 |
-
title = document['title']
|
| 39 |
-
chapter = document['chapter']
|
| 40 |
-
section = document['section']
|
| 41 |
-
file_path = document['file_path']
|
| 42 |
-
|
| 43 |
-
# First, identify structural boundaries (headings)
|
| 44 |
-
structure = self._identify_structure(content)
|
| 45 |
-
|
| 46 |
-
if not structure:
|
| 47 |
-
# If no headings found, chunk by max token size
|
| 48 |
-
return self._chunk_by_size(content, title, chapter, section, file_path)
|
| 49 |
-
|
| 50 |
-
# Split content by structural boundaries
|
| 51 |
-
structured_chunks = self._split_by_structure(content, structure)
|
| 52 |
-
|
| 53 |
-
# Further chunk large structural sections if needed
|
| 54 |
-
final_chunks = []
|
| 55 |
-
for i, (start_pos, end_pos, heading_context) in enumerate(structured_chunks):
|
| 56 |
-
chunk_content = content[start_pos:end_pos]
|
| 57 |
-
|
| 58 |
-
if self._count_tokens(chunk_content) > self.max_tokens:
|
| 59 |
-
# Split large sections while preserving heading context
|
| 60 |
-
sub_chunks = self._chunk_large_section(
|
| 61 |
-
chunk_content,
|
| 62 |
-
title,
|
| 63 |
-
chapter,
|
| 64 |
-
section,
|
| 65 |
-
file_path,
|
| 66 |
-
heading_context,
|
| 67 |
-
start_pos
|
| 68 |
-
)
|
| 69 |
-
final_chunks.extend(sub_chunks)
|
| 70 |
-
else:
|
| 71 |
-
# Create a single chunk for this section
|
| 72 |
-
chunk_id = str(uuid.uuid4())
|
| 73 |
-
token_count = self._count_tokens(chunk_content)
|
| 74 |
-
|
| 75 |
-
chunk = TextChunk(
|
| 76 |
-
id=chunk_id,
|
| 77 |
-
content=chunk_content,
|
| 78 |
-
title=title,
|
| 79 |
-
chapter=chapter,
|
| 80 |
-
section=section,
|
| 81 |
-
page_reference=file_path,
|
| 82 |
-
token_count=token_count,
|
| 83 |
-
original_start_pos=start_pos,
|
| 84 |
-
original_end_pos=end_pos
|
| 85 |
-
)
|
| 86 |
-
final_chunks.append(chunk)
|
| 87 |
-
|
| 88 |
-
return final_chunks
|
| 89 |
-
|
| 90 |
-
def _identify_structure(self, content: str) -> List[Tuple[int, str]]:
|
| 91 |
-
"""
|
| 92 |
-
Identify structural boundaries in the content (headings)
|
| 93 |
-
Returns list of (position, heading_text) tuples
|
| 94 |
-
"""
|
| 95 |
-
lines = content.split('\n')
|
| 96 |
-
structure = []
|
| 97 |
-
pos = 0
|
| 98 |
-
|
| 99 |
-
for line in lines:
|
| 100 |
-
# Check for markdown headings
|
| 101 |
-
heading_match = re.match(r'^(\#{1,6})\s+(.+)', line)
|
| 102 |
-
if heading_match:
|
| 103 |
-
level = len(heading_match.group(1))
|
| 104 |
-
heading_text = heading_match.group(2).strip()
|
| 105 |
-
structure.append((pos, f"{'#' * level} {heading_text}"))
|
| 106 |
-
|
| 107 |
-
pos += len(line) + 1 # +1 for newline
|
| 108 |
-
|
| 109 |
-
return structure
|
| 110 |
-
|
| 111 |
-
def _split_by_structure(self, content: str, structure: List[Tuple[int, str]]) -> List[Tuple[int, int, str]]:
|
| 112 |
-
"""
|
| 113 |
-
Split content by structural boundaries
|
| 114 |
-
Returns list of (start_pos, end_pos, heading_context) tuples
|
| 115 |
-
"""
|
| 116 |
-
if not structure:
|
| 117 |
-
return [(0, len(content), "")]
|
| 118 |
-
|
| 119 |
-
splits = []
|
| 120 |
-
start_pos = 0
|
| 121 |
-
|
| 122 |
-
for pos, heading in structure:
|
| 123 |
-
if pos > start_pos:
|
| 124 |
-
# Add the chunk before this heading
|
| 125 |
-
splits.append((start_pos, pos, heading))
|
| 126 |
-
start_pos = pos
|
| 127 |
-
|
| 128 |
-
# Add the final chunk if there's remaining content
|
| 129 |
-
if start_pos < len(content):
|
| 130 |
-
splits.append((start_pos, len(content), structure[-1][1]))
|
| 131 |
-
|
| 132 |
-
return splits
|
| 133 |
-
|
| 134 |
-
def _chunk_by_size(self, content: str, title: str, chapter: str, section: str, file_path: str) -> List[TextChunk]:
|
| 135 |
-
"""
|
| 136 |
-
Fallback method to chunk content by size when no structure is available
|
| 137 |
-
"""
|
| 138 |
-
chunks = []
|
| 139 |
-
tokens_per_chunk = self._estimate_tokens_per_chunk()
|
| 140 |
-
chunk_size = tokens_per_chunk * 4 # Rough estimate: 4 chars per token
|
| 141 |
-
|
| 142 |
-
for i in range(0, len(content), chunk_size):
|
| 143 |
-
chunk_content = content[i:i + chunk_size]
|
| 144 |
-
chunk_id = str(uuid.uuid4())
|
| 145 |
-
|
| 146 |
-
chunk = TextChunk(
|
| 147 |
-
id=chunk_id,
|
| 148 |
-
content=chunk_content,
|
| 149 |
-
title=title,
|
| 150 |
-
chapter=chapter,
|
| 151 |
-
section=section,
|
| 152 |
-
page_reference=file_path,
|
| 153 |
-
token_count=self._count_tokens(chunk_content),
|
| 154 |
-
original_start_pos=i,
|
| 155 |
-
original_end_pos=min(i + chunk_size, len(content))
|
| 156 |
-
)
|
| 157 |
-
chunks.append(chunk)
|
| 158 |
-
|
| 159 |
-
return chunks
|
| 160 |
-
|
| 161 |
-
def _chunk_large_section(
|
| 162 |
-
self,
|
| 163 |
-
content: str,
|
| 164 |
-
title: str,
|
| 165 |
-
chapter: str,
|
| 166 |
-
section: str,
|
| 167 |
-
file_path: str,
|
| 168 |
-
heading_context: str,
|
| 169 |
-
offset: int
|
| 170 |
-
) -> List[TextChunk]:
|
| 171 |
-
"""
|
| 172 |
-
Further chunk a large section while preserving heading context
|
| 173 |
-
"""
|
| 174 |
-
chunks = []
|
| 175 |
-
|
| 176 |
-
# If we have heading context, prepend it to each chunk
|
| 177 |
-
context_prefix = f"{heading_context}\n\n" if heading_context else ""
|
| 178 |
-
|
| 179 |
-
# Split content into sentences to find good break points
|
| 180 |
-
sentences = self._split_into_sentences(content)
|
| 181 |
-
|
| 182 |
-
current_chunk = ""
|
| 183 |
-
current_tokens = 0
|
| 184 |
-
chunk_idx = 0
|
| 185 |
-
|
| 186 |
-
for sentence in sentences:
|
| 187 |
-
sentence_tokens = self._count_tokens(sentence)
|
| 188 |
-
|
| 189 |
-
# If adding this sentence would exceed the token limit
|
| 190 |
-
if current_tokens + sentence_tokens > self.max_tokens and current_chunk:
|
| 191 |
-
# Save the current chunk
|
| 192 |
-
chunk_id = str(uuid.uuid4())
|
| 193 |
-
chunk = TextChunk(
|
| 194 |
-
id=chunk_id,
|
| 195 |
-
content=context_prefix + current_chunk,
|
| 196 |
-
title=title,
|
| 197 |
-
chapter=chapter,
|
| 198 |
-
section=section,
|
| 199 |
-
page_reference=file_path,
|
| 200 |
-
token_count=current_tokens,
|
| 201 |
-
original_start_pos=offset + content.find(current_chunk),
|
| 202 |
-
original_end_pos=offset + content.find(current_chunk) + len(current_chunk)
|
| 203 |
-
)
|
| 204 |
-
chunks.append(chunk)
|
| 205 |
-
|
| 206 |
-
# Start a new chunk with potential overlap
|
| 207 |
-
overlap_size = int(len(current_chunk) * self.overlap_ratio)
|
| 208 |
-
overlap_content = current_chunk[-overlap_size:] if overlap_size > 0 else ""
|
| 209 |
-
|
| 210 |
-
current_chunk = overlap_content + sentence
|
| 211 |
-
current_tokens = self._count_tokens(context_prefix + current_chunk)
|
| 212 |
-
chunk_idx += 1
|
| 213 |
-
else:
|
| 214 |
-
# Add sentence to current chunk
|
| 215 |
-
current_chunk += sentence
|
| 216 |
-
current_tokens += sentence_tokens
|
| 217 |
-
|
| 218 |
-
# Add the final chunk if it has content
|
| 219 |
-
if current_chunk.strip():
|
| 220 |
-
chunk_id = str(uuid.uuid4())
|
| 221 |
-
chunk = TextChunk(
|
| 222 |
-
id=chunk_id,
|
| 223 |
-
content=context_prefix + current_chunk,
|
| 224 |
-
title=title,
|
| 225 |
-
chapter=chapter,
|
| 226 |
-
section=section,
|
| 227 |
-
page_reference=file_path,
|
| 228 |
-
token_count=self._count_tokens(context_prefix + current_chunk),
|
| 229 |
-
original_start_pos=offset + content.find(current_chunk),
|
| 230 |
-
original_end_pos=offset + content.find(current_chunk) + len(current_chunk)
|
| 231 |
-
)
|
| 232 |
-
chunks.append(chunk)
|
| 233 |
-
|
| 234 |
-
return chunks
|
| 235 |
-
|
| 236 |
-
def _split_into_sentences(self, text: str) -> List[str]:
|
| 237 |
-
"""
|
| 238 |
-
Split text into sentences using common sentence boundaries
|
| 239 |
-
"""
|
| 240 |
-
# Use regex to split on sentence boundaries while preserving the delimiters
|
| 241 |
-
sentence_pattern = r'(?<=[.!?])\s+'
|
| 242 |
-
sentences = re.split(sentence_pattern, text)
|
| 243 |
-
|
| 244 |
-
# Re-add the punctuation to each sentence
|
| 245 |
-
result = []
|
| 246 |
-
for i, sentence in enumerate(sentences):
|
| 247 |
-
if i < len(sentences) - 1:
|
| 248 |
-
# Check if the original text had punctuation at the end of this sentence
|
| 249 |
-
# by looking at the character that followed this sentence in the original text
|
| 250 |
-
next_char_idx = len(''.join(sentences[:i+1])) + i # +i for spaces
|
| 251 |
-
if next_char_idx < len(text):
|
| 252 |
-
next_char = text[next_char_idx] if next_char_idx < len(text) else ''
|
| 253 |
-
if next_char in '.!?':
|
| 254 |
-
sentence += next_char
|
| 255 |
-
result.append(sentence + ' ')
|
| 256 |
-
|
| 257 |
-
# Clean up and ensure each sentence is properly formatted
|
| 258 |
-
result = [s.strip() for s in result if s.strip()]
|
| 259 |
-
return result
|
| 260 |
-
|
| 261 |
-
def _count_tokens(self, text: str) -> int:
|
| 262 |
-
"""
|
| 263 |
-
Count approximate number of tokens in text
|
| 264 |
-
This is a simple estimation; for more accurate counting, use tiktoken
|
| 265 |
-
"""
|
| 266 |
-
import tiktoken
|
| 267 |
-
# Use cl100k_base encoding which is used by many OpenAI models
|
| 268 |
-
encoding = tiktoken.get_encoding("cl100k_base")
|
| 269 |
-
return len(encoding.encode(text))
|
| 270 |
-
|
| 271 |
-
def _estimate_tokens_per_chunk(self) -> int:
|
| 272 |
-
"""
|
| 273 |
-
Estimate number of tokens that would fit in a chunk based on max_tokens
|
| 274 |
-
"""
|
| 275 |
-
# This is a rough estimation - in practice, you might want to use
|
| 276 |
-
# a more sophisticated approach based on your specific content
|
| 277 |
-
return min(self.max_tokens, 800) # Conservative estimate
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
def chunk_documents(documents: List[Dict]) -> List[TextChunk]:
|
| 281 |
-
"""
|
| 282 |
-
Convenience function to chunk a list of documents
|
| 283 |
-
"""
|
| 284 |
-
chunker = TextChunker()
|
| 285 |
-
all_chunks = []
|
| 286 |
-
|
| 287 |
-
for document in documents:
|
| 288 |
-
chunks = chunker.chunk_document(document)
|
| 289 |
-
all_chunks.extend(chunks)
|
| 290 |
-
|
| 291 |
-
return all_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/ingestion/document_parser.py
DELETED
|
@@ -1,146 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import re
|
| 3 |
-
from typing import List, Dict, Optional
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
import markdown
|
| 6 |
-
from bs4 import BeautifulSoup
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
class DocumentParser:
|
| 10 |
-
"""
|
| 11 |
-
Parses markdown files and extracts content with structural information
|
| 12 |
-
"""
|
| 13 |
-
|
| 14 |
-
def __init__(self):
|
| 15 |
-
pass
|
| 16 |
-
|
| 17 |
-
def parse_markdown_file(self, file_path: str) -> Dict:
|
| 18 |
-
"""
|
| 19 |
-
Parse a markdown file and extract content with structural information
|
| 20 |
-
"""
|
| 21 |
-
with open(file_path, 'r', encoding='utf-8') as file:
|
| 22 |
-
content = file.read()
|
| 23 |
-
|
| 24 |
-
# Extract metadata from frontmatter if present
|
| 25 |
-
metadata = self._extract_frontmatter(content)
|
| 26 |
-
|
| 27 |
-
# Extract structural information (headings)
|
| 28 |
-
structure = self._extract_structure(content)
|
| 29 |
-
|
| 30 |
-
# Get clean content without frontmatter
|
| 31 |
-
clean_content = self._remove_frontmatter(content)
|
| 32 |
-
|
| 33 |
-
# Extract title from the first heading or filename
|
| 34 |
-
title = metadata.get('title') or self._extract_title(clean_content) or Path(file_path).stem
|
| 35 |
-
|
| 36 |
-
# Determine chapter/section from file path
|
| 37 |
-
path_parts = Path(file_path).parts
|
| 38 |
-
chapter = self._extract_chapter_info(path_parts)
|
| 39 |
-
section = self._extract_section_info(path_parts)
|
| 40 |
-
|
| 41 |
-
return {
|
| 42 |
-
'title': title,
|
| 43 |
-
'content': clean_content,
|
| 44 |
-
'chapter': chapter,
|
| 45 |
-
'section': section,
|
| 46 |
-
'file_path': file_path,
|
| 47 |
-
'metadata': metadata,
|
| 48 |
-
'structure': structure
|
| 49 |
-
}
|
| 50 |
-
|
| 51 |
-
def _extract_frontmatter(self, content: str) -> Dict:
|
| 52 |
-
"""
|
| 53 |
-
Extract YAML frontmatter from markdown content
|
| 54 |
-
"""
|
| 55 |
-
import yaml
|
| 56 |
-
|
| 57 |
-
# Look for YAML frontmatter between --- delimiters
|
| 58 |
-
frontmatter_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
|
| 59 |
-
|
| 60 |
-
if frontmatter_match:
|
| 61 |
-
try:
|
| 62 |
-
frontmatter = yaml.safe_load(frontmatter_match.group(1))
|
| 63 |
-
return frontmatter or {}
|
| 64 |
-
except yaml.YAMLError:
|
| 65 |
-
return {}
|
| 66 |
-
|
| 67 |
-
return {}
|
| 68 |
-
|
| 69 |
-
def _remove_frontmatter(self, content: str) -> str:
|
| 70 |
-
"""
|
| 71 |
-
Remove YAML frontmatter from content
|
| 72 |
-
"""
|
| 73 |
-
frontmatter_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
|
| 74 |
-
|
| 75 |
-
if frontmatter_match:
|
| 76 |
-
return content[frontmatter_match.end():]
|
| 77 |
-
|
| 78 |
-
return content
|
| 79 |
-
|
| 80 |
-
def _extract_structure(self, content: str) -> List[Dict]:
|
| 81 |
-
"""
|
| 82 |
-
Extract structural information (headings) from markdown content
|
| 83 |
-
"""
|
| 84 |
-
# Convert markdown to HTML to easily extract headings
|
| 85 |
-
html = markdown.markdown(content)
|
| 86 |
-
soup = BeautifulSoup(html, 'html.parser')
|
| 87 |
-
|
| 88 |
-
structure = []
|
| 89 |
-
for i, heading in enumerate(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])):
|
| 90 |
-
structure.append({
|
| 91 |
-
'level': int(heading.name[1]),
|
| 92 |
-
'text': heading.get_text().strip(),
|
| 93 |
-
'position': i
|
| 94 |
-
})
|
| 95 |
-
|
| 96 |
-
return structure
|
| 97 |
-
|
| 98 |
-
def _extract_title(self, content: str) -> Optional[str]:
|
| 99 |
-
"""
|
| 100 |
-
Extract title from the first heading in the content
|
| 101 |
-
"""
|
| 102 |
-
lines = content.split('\n')
|
| 103 |
-
for line in lines:
|
| 104 |
-
# Check for markdown heading pattern
|
| 105 |
-
heading_match = re.match(r'^#+\s+(.+)', line)
|
| 106 |
-
if heading_match:
|
| 107 |
-
return heading_match.group(1).strip()
|
| 108 |
-
|
| 109 |
-
return None
|
| 110 |
-
|
| 111 |
-
def _extract_chapter_info(self, path_parts: tuple) -> str:
|
| 112 |
-
"""
|
| 113 |
-
Extract chapter information from file path
|
| 114 |
-
"""
|
| 115 |
-
# Look for common chapter-related directory names
|
| 116 |
-
for part in path_parts:
|
| 117 |
-
if 'chapter' in part.lower() or 'module' in part.lower():
|
| 118 |
-
return part
|
| 119 |
-
|
| 120 |
-
# If no chapter directory found, use the directory name
|
| 121 |
-
if len(path_parts) > 1:
|
| 122 |
-
return path_parts[-2] # Parent directory of the file
|
| 123 |
-
|
| 124 |
-
return 'unknown'
|
| 125 |
-
|
| 126 |
-
def _extract_section_info(self, path_parts: tuple) -> str:
|
| 127 |
-
"""
|
| 128 |
-
Extract section information from file path
|
| 129 |
-
"""
|
| 130 |
-
file_name = path_parts[-1]
|
| 131 |
-
# Remove file extension
|
| 132 |
-
section = Path(file_name).stem
|
| 133 |
-
return section
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
def scan_markdown_files(directory: str) -> List[str]:
|
| 137 |
-
"""
|
| 138 |
-
Scan a directory for markdown files
|
| 139 |
-
"""
|
| 140 |
-
markdown_files = []
|
| 141 |
-
for root, dirs, files in os.walk(directory):
|
| 142 |
-
for file in files:
|
| 143 |
-
if file.lower().endswith(('.md', '.markdown')):
|
| 144 |
-
markdown_files.append(os.path.join(root, file))
|
| 145 |
-
|
| 146 |
-
return markdown_files
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/ingestion/file_scanner.py
DELETED
|
@@ -1,92 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from typing import List, Dict, Optional
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from .document_parser import DocumentParser, scan_markdown_files
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
class FileScanner:
|
| 8 |
-
"""
|
| 9 |
-
Scans and processes markdown files from the Docusaurus documentation directory
|
| 10 |
-
"""
|
| 11 |
-
|
| 12 |
-
def __init__(self, base_path: str = "docusaurus/docs"):
|
| 13 |
-
self.base_path = base_path
|
| 14 |
-
self.parser = DocumentParser()
|
| 15 |
-
|
| 16 |
-
def scan_and_parse_documents(self) -> List[Dict]:
|
| 17 |
-
"""
|
| 18 |
-
Scan the documentation directory and parse all markdown files
|
| 19 |
-
"""
|
| 20 |
-
if not os.path.exists(self.base_path):
|
| 21 |
-
raise FileNotFoundError(f"Documentation directory not found: {self.base_path}")
|
| 22 |
-
|
| 23 |
-
markdown_files = scan_markdown_files(self.base_path)
|
| 24 |
-
documents = []
|
| 25 |
-
|
| 26 |
-
for file_path in markdown_files:
|
| 27 |
-
try:
|
| 28 |
-
document = self.parser.parse_markdown_file(file_path)
|
| 29 |
-
documents.append(document)
|
| 30 |
-
except Exception as e:
|
| 31 |
-
print(f"Error parsing file {file_path}: {str(e)}")
|
| 32 |
-
continue
|
| 33 |
-
|
| 34 |
-
return documents
|
| 35 |
-
|
| 36 |
-
def validate_document(self, document: Dict) -> bool:
|
| 37 |
-
"""
|
| 38 |
-
Validate document structure and content
|
| 39 |
-
"""
|
| 40 |
-
required_fields = ['title', 'content', 'chapter', 'section', 'file_path']
|
| 41 |
-
for field in required_fields:
|
| 42 |
-
if field not in document or not document[field]:
|
| 43 |
-
return False
|
| 44 |
-
|
| 45 |
-
# Check content length
|
| 46 |
-
if len(document['content'].strip()) < 10:
|
| 47 |
-
return False
|
| 48 |
-
|
| 49 |
-
return True
|
| 50 |
-
|
| 51 |
-
def get_document_stats(self, documents: List[Dict]) -> Dict:
|
| 52 |
-
"""
|
| 53 |
-
Get statistics about the parsed documents
|
| 54 |
-
"""
|
| 55 |
-
total_docs = len(documents)
|
| 56 |
-
valid_docs = sum(1 for doc in documents if self.validate_document(doc))
|
| 57 |
-
total_chars = sum(len(doc['content']) for doc in documents)
|
| 58 |
-
unique_chapters = len(set(doc['chapter'] for doc in documents))
|
| 59 |
-
|
| 60 |
-
return {
|
| 61 |
-
'total_documents': total_docs,
|
| 62 |
-
'valid_documents': valid_docs,
|
| 63 |
-
'invalid_documents': total_docs - valid_docs,
|
| 64 |
-
'total_characters': total_chars,
|
| 65 |
-
'unique_chapters': unique_chapters,
|
| 66 |
-
'average_length': total_chars // total_docs if total_docs > 0 else 0
|
| 67 |
-
}
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
def main():
|
| 71 |
-
"""
|
| 72 |
-
Main function to demonstrate file scanning
|
| 73 |
-
"""
|
| 74 |
-
# Use the docusaurus docs directory by default, or allow override
|
| 75 |
-
scanner = FileScanner()
|
| 76 |
-
documents = scanner.scan_and_parse_documents()
|
| 77 |
-
|
| 78 |
-
print(f"Found {len(documents)} documents")
|
| 79 |
-
stats = scanner.get_document_stats(documents)
|
| 80 |
-
print(f"Statistics: {stats}")
|
| 81 |
-
|
| 82 |
-
# Print first document as example
|
| 83 |
-
if documents:
|
| 84 |
-
print(f"\nFirst document example:")
|
| 85 |
-
print(f"Title: {documents[0]['title']}")
|
| 86 |
-
print(f"Chapter: {documents[0]['chapter']}")
|
| 87 |
-
print(f"Section: {documents[0]['section']}")
|
| 88 |
-
print(f"Content preview: {documents[0]['content'][:200]}...")
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
if __name__ == "__main__":
|
| 92 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/main.py
DELETED
|
@@ -1,44 +0,0 @@
|
|
| 1 |
-
from fastapi import FastAPI
|
| 2 |
-
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
-
|
| 4 |
-
app = FastAPI(
|
| 5 |
-
title="RAG Chatbot API",
|
| 6 |
-
description="API for RAG-based question answering for Physical AI & Humanoid Robotics book",
|
| 7 |
-
version="1.0.0"
|
| 8 |
-
)
|
| 9 |
-
|
| 10 |
-
# Add CORS middleware
|
| 11 |
-
app.add_middleware(
|
| 12 |
-
CORSMiddleware,
|
| 13 |
-
allow_origins=["*"], # In production, replace with specific origins
|
| 14 |
-
allow_credentials=True,
|
| 15 |
-
allow_methods=["*"],
|
| 16 |
-
allow_headers=["*"],
|
| 17 |
-
)
|
| 18 |
-
|
| 19 |
-
@app.get("/")
|
| 20 |
-
async def root():
|
| 21 |
-
return {"message": "RAG Chatbot API is running!"}
|
| 22 |
-
|
| 23 |
-
@app.get("/health")
|
| 24 |
-
async def health_check():
|
| 25 |
-
return {"status": "healthy", "service": "RAG Chatbot API"}
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
@app.on_event("startup")
|
| 29 |
-
async def startup_event():
|
| 30 |
-
"""
|
| 31 |
-
Create database tables on startup
|
| 32 |
-
"""
|
| 33 |
-
from app.database.models import create_tables
|
| 34 |
-
create_tables() # Create tables with correct schema (using checkfirst to avoid errors)
|
| 35 |
-
|
| 36 |
-
# Include API routes
|
| 37 |
-
from app.api import health, ingest, chat
|
| 38 |
-
app.include_router(health.router, prefix="/api")
|
| 39 |
-
app.include_router(ingest.router, prefix="/api")
|
| 40 |
-
app.include_router(chat.router, prefix="/api")
|
| 41 |
-
|
| 42 |
-
if __name__ == "__main__":
|
| 43 |
-
import uvicorn
|
| 44 |
-
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/models/__pycache__/chat.cpython-312.pyc
DELETED
|
Binary file (2.76 kB)
|
|
|
app/models/chat.py
DELETED
|
@@ -1,69 +0,0 @@
|
|
| 1 |
-
from pydantic import BaseModel
|
| 2 |
-
from typing import List, Dict, Optional
|
| 3 |
-
from datetime import datetime
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class ChatSession(BaseModel):
|
| 7 |
-
"""
|
| 8 |
-
Model for chat session data
|
| 9 |
-
"""
|
| 10 |
-
session_id: str
|
| 11 |
-
user_id: Optional[str] = None
|
| 12 |
-
created_at: datetime
|
| 13 |
-
updated_at: datetime
|
| 14 |
-
metadata: Optional[Dict] = None
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
class ChatMessage(BaseModel):
|
| 18 |
-
"""
|
| 19 |
-
Model for chat message data
|
| 20 |
-
"""
|
| 21 |
-
message_id: str
|
| 22 |
-
session_id: str
|
| 23 |
-
role: str # "user" or "assistant"
|
| 24 |
-
content: str
|
| 25 |
-
citations: Optional[List[Dict[str, str]]] = None
|
| 26 |
-
query_context_id: Optional[str] = None
|
| 27 |
-
timestamp: datetime
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
class QueryContext(BaseModel):
|
| 31 |
-
"""
|
| 32 |
-
Model for query context data
|
| 33 |
-
"""
|
| 34 |
-
context_id: str
|
| 35 |
-
session_id: str
|
| 36 |
-
selected_text: Optional[str] = None
|
| 37 |
-
query_type: str # "global" or "selection"
|
| 38 |
-
created_at: datetime
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
class ChatRequest(BaseModel):
|
| 42 |
-
"""
|
| 43 |
-
Model for chat API request
|
| 44 |
-
"""
|
| 45 |
-
session_id: str
|
| 46 |
-
message: str
|
| 47 |
-
selected_text: Optional[str] = None
|
| 48 |
-
query_type: str = "global" # "global" or "selection"
|
| 49 |
-
top_k: int = 5
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
class ChatResponse(BaseModel):
|
| 53 |
-
"""
|
| 54 |
-
Model for chat API response
|
| 55 |
-
"""
|
| 56 |
-
response: str
|
| 57 |
-
citations: List[Dict[str, str]]
|
| 58 |
-
session_id: str
|
| 59 |
-
query_type: str
|
| 60 |
-
timestamp: str
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
class ChatHistoryResponse(BaseModel):
|
| 64 |
-
"""
|
| 65 |
-
Model for chat history API response
|
| 66 |
-
"""
|
| 67 |
-
session_id: str
|
| 68 |
-
messages: List[ChatMessage]
|
| 69 |
-
timestamp: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/prompting/__pycache__/context_filter.cpython-312.pyc
DELETED
|
Binary file (8.18 kB)
|
|
|
app/prompting/__pycache__/prompt_builder.cpython-312.pyc
DELETED
|
Binary file (7.12 kB)
|
|
|
app/prompting/context_filter.py
DELETED
|
@@ -1,205 +0,0 @@
|
|
| 1 |
-
from typing import List, Dict, Any, Optional
|
| 2 |
-
from app.retrieval.retriever import Retriever
|
| 3 |
-
from app.prompting.prompt_builder import PromptBuilder
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class ContextFilter:
|
| 7 |
-
"""
|
| 8 |
-
Filters and validates contexts to prevent information leakage between query types
|
| 9 |
-
"""
|
| 10 |
-
|
| 11 |
-
def __init__(self):
|
| 12 |
-
self.retriever = Retriever()
|
| 13 |
-
self.prompt_builder = PromptBuilder()
|
| 14 |
-
|
| 15 |
-
def filter_context_for_query_type(
|
| 16 |
-
self,
|
| 17 |
-
contexts: List[Dict[str, Any]],
|
| 18 |
-
query_type: str,
|
| 19 |
-
selected_text: Optional[str] = None
|
| 20 |
-
) -> List[Dict[str, Any]]:
|
| 21 |
-
"""
|
| 22 |
-
Filter contexts based on query type to prevent information leakage
|
| 23 |
-
"""
|
| 24 |
-
if query_type == "selection" and selected_text:
|
| 25 |
-
# For selection-based queries, we need to ensure contexts are relevant
|
| 26 |
-
# to the selected text and don't introduce unrelated global knowledge
|
| 27 |
-
return self._filter_selection_contexts(contexts, selected_text)
|
| 28 |
-
elif query_type == "global":
|
| 29 |
-
# For global queries, we can use all retrieved contexts
|
| 30 |
-
return contexts
|
| 31 |
-
else:
|
| 32 |
-
# Default to global behavior
|
| 33 |
-
return contexts
|
| 34 |
-
|
| 35 |
-
def _filter_selection_contexts(
|
| 36 |
-
self,
|
| 37 |
-
contexts: List[Dict[str, Any]],
|
| 38 |
-
selected_text: str
|
| 39 |
-
) -> List[Dict[str, Any]]:
|
| 40 |
-
"""
|
| 41 |
-
Filter contexts to ensure they're relevant to the selected text for selection-based queries
|
| 42 |
-
"""
|
| 43 |
-
if not contexts or not selected_text:
|
| 44 |
-
return contexts
|
| 45 |
-
|
| 46 |
-
# Simple relevance check: ensure contexts have some connection to the selected text
|
| 47 |
-
# In a more sophisticated implementation, you might use semantic similarity
|
| 48 |
-
selected_keywords = set(selected_text.lower().split()[:10]) # Use first 10 words as keywords
|
| 49 |
-
filtered_contexts = []
|
| 50 |
-
|
| 51 |
-
for context in contexts:
|
| 52 |
-
content = context.get('content', '').lower()
|
| 53 |
-
content_words = set(content.split())
|
| 54 |
-
|
| 55 |
-
# Check if there's significant overlap in keywords
|
| 56 |
-
keyword_overlap = len(selected_keywords.intersection(content_words))
|
| 57 |
-
keyword_ratio = keyword_overlap / len(selected_keywords) if selected_keywords else 0
|
| 58 |
-
|
| 59 |
-
# Include context if it has some relevance to the selected text
|
| 60 |
-
# or if we don't have enough contexts yet
|
| 61 |
-
if keyword_ratio > 0.1 or len(filtered_contexts) < 2: # At least 10% overlap or include first few
|
| 62 |
-
filtered_contexts.append(context)
|
| 63 |
-
|
| 64 |
-
return filtered_contexts
|
| 65 |
-
|
| 66 |
-
def validate_context_isolation(
|
| 67 |
-
self,
|
| 68 |
-
contexts: List[Dict[str, Any]],
|
| 69 |
-
query_type: str,
|
| 70 |
-
selected_text: Optional[str] = None
|
| 71 |
-
) -> Dict[str, Any]:
|
| 72 |
-
"""
|
| 73 |
-
Validate that contexts are properly isolated based on query type
|
| 74 |
-
"""
|
| 75 |
-
validation_result = {
|
| 76 |
-
'is_valid': True,
|
| 77 |
-
'query_type': query_type,
|
| 78 |
-
'context_count': len(contexts),
|
| 79 |
-
'issues': []
|
| 80 |
-
}
|
| 81 |
-
|
| 82 |
-
if query_type == "selection" and selected_text:
|
| 83 |
-
# Validate that contexts are related to selected text
|
| 84 |
-
validation_result.update(self._validate_selection_contexts(contexts, selected_text))
|
| 85 |
-
elif query_type == "global":
|
| 86 |
-
# For global queries, validate that we have diverse contexts
|
| 87 |
-
validation_result.update(self._validate_global_contexts(contexts))
|
| 88 |
-
|
| 89 |
-
return validation_result
|
| 90 |
-
|
| 91 |
-
def _validate_selection_contexts(
|
| 92 |
-
self,
|
| 93 |
-
contexts: List[Dict[str, Any]],
|
| 94 |
-
selected_text: str
|
| 95 |
-
) -> Dict[str, Any]:
|
| 96 |
-
"""
|
| 97 |
-
Validate selection-based contexts
|
| 98 |
-
"""
|
| 99 |
-
result = {
|
| 100 |
-
'is_valid': True,
|
| 101 |
-
'issues': []
|
| 102 |
-
}
|
| 103 |
-
|
| 104 |
-
if not contexts:
|
| 105 |
-
result['is_valid'] = False
|
| 106 |
-
result['issues'].append("No contexts provided for selection-based query")
|
| 107 |
-
return result
|
| 108 |
-
|
| 109 |
-
# Check relevance to selected text
|
| 110 |
-
relevant_count = 0
|
| 111 |
-
selected_keywords = set(selected_text.lower().split()[:10])
|
| 112 |
-
|
| 113 |
-
for context in contexts:
|
| 114 |
-
content = context.get('content', '').lower()
|
| 115 |
-
content_words = set(content.split())
|
| 116 |
-
keyword_overlap = len(selected_keywords.intersection(content_words))
|
| 117 |
-
|
| 118 |
-
if keyword_overlap > 0:
|
| 119 |
-
relevant_count += 1
|
| 120 |
-
|
| 121 |
-
relevance_ratio = relevant_count / len(contexts)
|
| 122 |
-
if relevance_ratio < 0.3: # Less than 30% of contexts are relevant
|
| 123 |
-
result['is_valid'] = False
|
| 124 |
-
result['issues'].append(
|
| 125 |
-
f"Only {relevant_count}/{len(contexts)} contexts ({relevance_ratio:.1%}) "
|
| 126 |
-
f"are relevant to the selected text"
|
| 127 |
-
)
|
| 128 |
-
|
| 129 |
-
return result
|
| 130 |
-
|
| 131 |
-
def _validate_global_contexts(self, contexts: List[Dict[str, Any]]) -> Dict[str, Any]:
|
| 132 |
-
"""
|
| 133 |
-
Validate global contexts
|
| 134 |
-
"""
|
| 135 |
-
result = {
|
| 136 |
-
'is_valid': True,
|
| 137 |
-
'issues': []
|
| 138 |
-
}
|
| 139 |
-
|
| 140 |
-
if not contexts:
|
| 141 |
-
result['is_valid'] = False
|
| 142 |
-
result['issues'].append("No contexts provided for global query")
|
| 143 |
-
return result
|
| 144 |
-
|
| 145 |
-
# Check for diversity in chapters/sections
|
| 146 |
-
unique_chapters = set(ctx.get('chapter', '') for ctx in contexts)
|
| 147 |
-
unique_sections = set(ctx.get('section', '') for ctx in contexts)
|
| 148 |
-
|
| 149 |
-
if len(unique_chapters) < 2 and len(contexts) > 1:
|
| 150 |
-
result['issues'].append("Contexts lack diversity - all from same chapter")
|
| 151 |
-
|
| 152 |
-
return result
|
| 153 |
-
|
| 154 |
-
def enforce_context_boundaries(
|
| 155 |
-
self,
|
| 156 |
-
contexts: List[Dict[str, Any]],
|
| 157 |
-
query_type: str,
|
| 158 |
-
selected_text: Optional[str] = None,
|
| 159 |
-
max_contexts: int = 5
|
| 160 |
-
) -> List[Dict[str, Any]]:
|
| 161 |
-
"""
|
| 162 |
-
Enforce strict boundaries on contexts to prevent information leakage
|
| 163 |
-
"""
|
| 164 |
-
# First, filter based on query type
|
| 165 |
-
filtered_contexts = self.filter_context_for_query_type(contexts, query_type, selected_text)
|
| 166 |
-
|
| 167 |
-
# Then enforce maximum count
|
| 168 |
-
if len(filtered_contexts) > max_contexts:
|
| 169 |
-
filtered_contexts = filtered_contexts[:max_contexts]
|
| 170 |
-
|
| 171 |
-
# Validate the final contexts
|
| 172 |
-
validation = self.validate_context_isolation(filtered_contexts, query_type, selected_text)
|
| 173 |
-
|
| 174 |
-
if not validation['is_valid']:
|
| 175 |
-
print(f"Context validation warning: {validation['issues']}")
|
| 176 |
-
|
| 177 |
-
return filtered_contexts
|
| 178 |
-
|
| 179 |
-
def build_isolated_context_string(
|
| 180 |
-
self,
|
| 181 |
-
contexts: List[Dict[str, Any]],
|
| 182 |
-
query_type: str,
|
| 183 |
-
selected_text: Optional[str] = None
|
| 184 |
-
) -> str:
|
| 185 |
-
"""
|
| 186 |
-
Build a context string that enforces isolation between query types
|
| 187 |
-
"""
|
| 188 |
-
if query_type == "selection" and selected_text:
|
| 189 |
-
# Build context string focused on selected text and related content
|
| 190 |
-
context_str = f"SELECTED TEXT:\n{selected_text}\n\nRELATED CONTENT:\n"
|
| 191 |
-
for i, ctx in enumerate(contexts):
|
| 192 |
-
context_str += f"[{i+1}] {ctx.get('content', '')}\n"
|
| 193 |
-
context_str += f"Source: {ctx.get('chapter', '')} - {ctx.get('section', '')}\n\n"
|
| 194 |
-
else:
|
| 195 |
-
# Build global context string
|
| 196 |
-
context_str = "RETRIEVED CONTENT:\n"
|
| 197 |
-
for i, ctx in enumerate(contexts):
|
| 198 |
-
context_str += f"[{i+1}] {ctx.get('content', '')}\n"
|
| 199 |
-
context_str += f"Source: {ctx.get('chapter', '')} - {ctx.get('section', '')}\n\n"
|
| 200 |
-
|
| 201 |
-
return context_str
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
# Global instance
|
| 205 |
-
context_filter = ContextFilter()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/prompting/prompt_builder.py
DELETED
|
@@ -1,187 +0,0 @@
|
|
| 1 |
-
from typing import List, Dict, Any, Optional
|
| 2 |
-
from app.retrieval.retriever import Retriever
|
| 3 |
-
from app.config import settings
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class PromptBuilder:
|
| 7 |
-
"""
|
| 8 |
-
Builds context-aware prompts for the LLM while preventing hallucinations
|
| 9 |
-
"""
|
| 10 |
-
|
| 11 |
-
def __init__(self):
|
| 12 |
-
self.retriever = Retriever()
|
| 13 |
-
|
| 14 |
-
def build_global_query_prompt(
|
| 15 |
-
self,
|
| 16 |
-
query: str,
|
| 17 |
-
retrieved_contexts: List[Dict[str, Any]],
|
| 18 |
-
max_context_length: int = 2000
|
| 19 |
-
) -> str:
|
| 20 |
-
"""
|
| 21 |
-
Build a prompt for global book queries using all retrieved contexts
|
| 22 |
-
"""
|
| 23 |
-
# Start with system message to prevent hallucinations
|
| 24 |
-
system_prompt = (
|
| 25 |
-
"You are an AI assistant that answers questions based only on the provided context. "
|
| 26 |
-
"Do not use any prior knowledge or information not present in the context. "
|
| 27 |
-
"If the answer cannot be found in the provided context, respond with: "
|
| 28 |
-
"'The provided context does not contain information to answer this question.'\n\n"
|
| 29 |
-
)
|
| 30 |
-
|
| 31 |
-
# Add retrieved contexts
|
| 32 |
-
context_section = "### CONTEXT:\n\n"
|
| 33 |
-
total_context_length = 0
|
| 34 |
-
|
| 35 |
-
for i, ctx in enumerate(retrieved_contexts):
|
| 36 |
-
if total_context_length >= max_context_length:
|
| 37 |
-
break
|
| 38 |
-
|
| 39 |
-
context_text = ctx.get('content', '')
|
| 40 |
-
# Truncate if too long
|
| 41 |
-
if len(context_text) + total_context_length > max_context_length:
|
| 42 |
-
available_length = max_context_length - total_context_length
|
| 43 |
-
context_text = context_text[:available_length]
|
| 44 |
-
|
| 45 |
-
context_section += f"**Source {i+1} ({ctx.get('title', 'Unknown')} - {ctx.get('chapter', 'Unknown')}):**\n"
|
| 46 |
-
context_section += f"{context_text}\n\n"
|
| 47 |
-
total_context_length += len(context_text)
|
| 48 |
-
|
| 49 |
-
# Add user query
|
| 50 |
-
user_query_section = f"### QUESTION:\n{query}\n\n"
|
| 51 |
-
|
| 52 |
-
# Add instruction for response format
|
| 53 |
-
response_format = (
|
| 54 |
-
"### INSTRUCTIONS:\n"
|
| 55 |
-
"1. Answer the question based ONLY on the provided context\n"
|
| 56 |
-
"2. If the context doesn't contain the answer, say so explicitly\n"
|
| 57 |
-
"3. Include relevant citations to the sources in your response\n"
|
| 58 |
-
"4. Keep your response concise and to the point\n\n"
|
| 59 |
-
)
|
| 60 |
-
|
| 61 |
-
# Combine all parts
|
| 62 |
-
full_prompt = system_prompt + context_section + user_query_section + response_format
|
| 63 |
-
return full_prompt
|
| 64 |
-
|
| 65 |
-
def build_selection_based_prompt(
|
| 66 |
-
self,
|
| 67 |
-
query: str,
|
| 68 |
-
selected_text: str,
|
| 69 |
-
retrieved_contexts: List[Dict[str, Any]],
|
| 70 |
-
max_context_length: int = 2000
|
| 71 |
-
) -> str:
|
| 72 |
-
"""
|
| 73 |
-
Build a prompt for selection-based queries using only the selected text and relevant contexts
|
| 74 |
-
"""
|
| 75 |
-
# Start with system message
|
| 76 |
-
system_prompt = (
|
| 77 |
-
"You are an AI assistant that answers questions based only on the provided selected text and related context. "
|
| 78 |
-
"Do not use any prior knowledge or information not present in the provided content. "
|
| 79 |
-
"Focus your answer on the relationship between the selected text and the question. "
|
| 80 |
-
"If the answer cannot be found in the provided content, respond with: "
|
| 81 |
-
"'The provided content does not contain information to answer this question.'\n\n"
|
| 82 |
-
)
|
| 83 |
-
|
| 84 |
-
# Add the selected text as primary context
|
| 85 |
-
primary_context = f"### SELECTED TEXT:\n{selected_text}\n\n"
|
| 86 |
-
|
| 87 |
-
# Add related contexts
|
| 88 |
-
related_context = "### RELATED CONTEXT:\n\n"
|
| 89 |
-
total_context_length = len(selected_text)
|
| 90 |
-
|
| 91 |
-
for i, ctx in enumerate(retrieved_contexts):
|
| 92 |
-
if total_context_length >= max_context_length:
|
| 93 |
-
break
|
| 94 |
-
|
| 95 |
-
context_text = ctx.get('content', '')
|
| 96 |
-
# Truncate if too long
|
| 97 |
-
if len(context_text) + total_context_length > max_context_length:
|
| 98 |
-
available_length = max_context_length - total_context_length
|
| 99 |
-
context_text = context_text[:available_length]
|
| 100 |
-
|
| 101 |
-
related_context += f"**Related Content {i+1} ({ctx.get('title', 'Unknown')} - {ctx.get('chapter', 'Unknown')}):**\n"
|
| 102 |
-
related_context += f"{context_text}\n\n"
|
| 103 |
-
total_context_length += len(context_text)
|
| 104 |
-
|
| 105 |
-
# Add user query
|
| 106 |
-
user_query_section = f"### QUESTION ABOUT SELECTED TEXT:\n{query}\n\n"
|
| 107 |
-
|
| 108 |
-
# Add instruction for response format
|
| 109 |
-
response_format = (
|
| 110 |
-
"### INSTRUCTIONS:\n"
|
| 111 |
-
"1. Answer the question based ONLY on the selected text and related context\n"
|
| 112 |
-
"2. Focus on how the question relates to the selected text\n"
|
| 113 |
-
"3. If the content doesn't contain the answer, say so explicitly\n"
|
| 114 |
-
"4. Include citations to the sources in your response\n"
|
| 115 |
-
"5. Keep your response concise and relevant to the selected text\n\n"
|
| 116 |
-
)
|
| 117 |
-
|
| 118 |
-
# Combine all parts
|
| 119 |
-
full_prompt = system_prompt + primary_context + related_context + user_query_section + response_format
|
| 120 |
-
return full_prompt
|
| 121 |
-
|
| 122 |
-
def build_context_filter_prompt(
|
| 123 |
-
self,
|
| 124 |
-
query: str,
|
| 125 |
-
available_contexts: List[Dict[str, Any]],
|
| 126 |
-
max_contexts_to_use: int = 3
|
| 127 |
-
) -> str:
|
| 128 |
-
"""
|
| 129 |
-
Build a prompt to filter relevant contexts from a larger set
|
| 130 |
-
"""
|
| 131 |
-
system_prompt = (
|
| 132 |
-
"You are an AI assistant that helps filter relevant contexts for a given question. "
|
| 133 |
-
"Analyze the question and the provided contexts, then select only the most relevant ones.\n\n"
|
| 134 |
-
)
|
| 135 |
-
|
| 136 |
-
# Add contexts
|
| 137 |
-
contexts_section = "### AVAILABLE CONTEXTS:\n\n"
|
| 138 |
-
for i, ctx in enumerate(available_contexts):
|
| 139 |
-
contexts_section += f"**Context {i+1} ({ctx.get('title', 'Unknown')} - {ctx.get('chapter', 'Unknown')}):**\n"
|
| 140 |
-
contexts_section += f"{ctx.get('content', '')[:500]}...\n\n" # Truncate for brevity
|
| 141 |
-
|
| 142 |
-
# Add query
|
| 143 |
-
query_section = f"### QUESTION:\n{query}\n\n"
|
| 144 |
-
|
| 145 |
-
# Add instructions
|
| 146 |
-
instruction_section = (
|
| 147 |
-
"### INSTRUCTIONS:\n"
|
| 148 |
-
"1. Identify which contexts are most relevant to answering the question\n"
|
| 149 |
-
"2. List the most relevant contexts by their number\n"
|
| 150 |
-
"3. Provide a brief reason for why each selected context is relevant\n"
|
| 151 |
-
f"4. Select at most {max_contexts_to_use} contexts\n\n"
|
| 152 |
-
)
|
| 153 |
-
|
| 154 |
-
# Add response format
|
| 155 |
-
response_format = (
|
| 156 |
-
"### RESPONSE FORMAT:\n"
|
| 157 |
-
"Respond with only the following JSON format:\n"
|
| 158 |
-
"{\n"
|
| 159 |
-
" \"relevant_contexts\": [\n"
|
| 160 |
-
" {\n"
|
| 161 |
-
" \"index\": 0,\n"
|
| 162 |
-
" \"reason\": \"Brief explanation of relevance\"\n"
|
| 163 |
-
" }\n"
|
| 164 |
-
" ]\n"
|
| 165 |
-
"}\n"
|
| 166 |
-
)
|
| 167 |
-
|
| 168 |
-
full_prompt = system_prompt + contexts_section + query_section + instruction_section + response_format
|
| 169 |
-
return full_prompt
|
| 170 |
-
|
| 171 |
-
def validate_prompt_context_isolation(self, prompt: str, query_type: str, original_context: Optional[str] = None) -> bool:
|
| 172 |
-
"""
|
| 173 |
-
Validate that the prompt properly isolates contexts based on query type
|
| 174 |
-
"""
|
| 175 |
-
if query_type == "selection" and original_context:
|
| 176 |
-
# For selection queries, ensure the prompt focuses on the selected text
|
| 177 |
-
# This is a basic validation - in practice, you'd want more sophisticated checks
|
| 178 |
-
return "SELECTED TEXT" in prompt or original_context[:50] in prompt
|
| 179 |
-
else:
|
| 180 |
-
# For global queries, ensure the prompt uses broader context
|
| 181 |
-
return "CONTEXT:" in prompt
|
| 182 |
-
|
| 183 |
-
return True
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
# Global instance
|
| 187 |
-
prompt_builder = PromptBuilder()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/retrieval/__pycache__/retriever.cpython-312.pyc
DELETED
|
Binary file (6.85 kB)
|
|
|
app/retrieval/__pycache__/vector_search.cpython-312.pyc
DELETED
|
Binary file (4.78 kB)
|
|
|
app/retrieval/retriever.py
DELETED
|
@@ -1,149 +0,0 @@
|
|
| 1 |
-
from typing import List, Dict, Any, Optional
|
| 2 |
-
from app.retrieval.vector_search import VectorSearchEngine
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
class Retriever:
|
| 6 |
-
"""
|
| 7 |
-
High-level retriever that handles the complete retrieval process
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
-
def __init__(self):
|
| 11 |
-
self.vector_search = VectorSearchEngine()
|
| 12 |
-
|
| 13 |
-
async def retrieve_relevant_documents(
|
| 14 |
-
self,
|
| 15 |
-
query: str,
|
| 16 |
-
top_k: int = 5,
|
| 17 |
-
query_type: str = "global", # "global" or "selection"
|
| 18 |
-
selected_text: Optional[str] = None,
|
| 19 |
-
filters: Optional[Dict] = None
|
| 20 |
-
) -> List[Dict[str, Any]]:
|
| 21 |
-
"""
|
| 22 |
-
Retrieve relevant documents based on the query and query type
|
| 23 |
-
"""
|
| 24 |
-
if query_type == "selection" and selected_text:
|
| 25 |
-
# For selection-based queries, we use the selected text as context
|
| 26 |
-
# but still search for relevant content in the book
|
| 27 |
-
# This approach focuses on content related to the selected text
|
| 28 |
-
search_query = f"{selected_text} {query}".strip()
|
| 29 |
-
else:
|
| 30 |
-
# For global queries, we search with the original query
|
| 31 |
-
search_query = query
|
| 32 |
-
|
| 33 |
-
# Perform the search
|
| 34 |
-
results = await self.vector_search.search_with_query(
|
| 35 |
-
query=search_query,
|
| 36 |
-
top_k=top_k,
|
| 37 |
-
filters=filters
|
| 38 |
-
)
|
| 39 |
-
|
| 40 |
-
# Apply ranking and filtering
|
| 41 |
-
ranked_results = self.vector_search.rank_results_by_relevance(results, query)
|
| 42 |
-
filtered_results = self.vector_search.filter_results(ranked_results, filters)
|
| 43 |
-
|
| 44 |
-
return filtered_results
|
| 45 |
-
|
| 46 |
-
async def retrieve_with_context_filtering(
|
| 47 |
-
self,
|
| 48 |
-
query: str,
|
| 49 |
-
top_k: int = 5,
|
| 50 |
-
query_type: str = "global",
|
| 51 |
-
selected_text: Optional[str] = None
|
| 52 |
-
) -> List[Dict[str, Any]]:
|
| 53 |
-
"""
|
| 54 |
-
Retrieve documents with special handling for different query types
|
| 55 |
-
"""
|
| 56 |
-
if query_type == "selection" and selected_text:
|
| 57 |
-
# For selection-based queries, we might want to prioritize results
|
| 58 |
-
# that are semantically similar to both the selected text and the query
|
| 59 |
-
combined_query = f"Context: {selected_text}\nQuestion: {query}"
|
| 60 |
-
results = await self.vector_search.search_with_query(
|
| 61 |
-
query=combined_query,
|
| 62 |
-
top_k=top_k
|
| 63 |
-
)
|
| 64 |
-
else:
|
| 65 |
-
# For global queries, search normally
|
| 66 |
-
results = await self.vector_search.search_with_query(
|
| 67 |
-
query=query,
|
| 68 |
-
top_k=top_k
|
| 69 |
-
)
|
| 70 |
-
|
| 71 |
-
# Apply additional filtering to ensure results are relevant
|
| 72 |
-
# For selection-based queries, we might want to ensure results are related to the selected text
|
| 73 |
-
if query_type == "selection" and selected_text:
|
| 74 |
-
# Filter results to ensure they're related to the selected text context
|
| 75 |
-
filtered_results = []
|
| 76 |
-
for result in results:
|
| 77 |
-
# This is a simple check - in practice, you might want more sophisticated filtering
|
| 78 |
-
content = result.get('content', '').lower()
|
| 79 |
-
selected_lower = selected_text.lower()
|
| 80 |
-
|
| 81 |
-
# Check if the result content has some relation to the selected text
|
| 82 |
-
# This could be improved with semantic similarity checks
|
| 83 |
-
if any(word in content for word in selected_lower.split()[:5]): # Check first 5 words
|
| 84 |
-
filtered_results.append(result)
|
| 85 |
-
elif len(filtered_results) < top_k: # Add some results even if not perfectly matched
|
| 86 |
-
filtered_results.append(result)
|
| 87 |
-
|
| 88 |
-
results = filtered_results
|
| 89 |
-
|
| 90 |
-
return results
|
| 91 |
-
|
| 92 |
-
async def retrieve_for_citation(self, query: str, top_k: int = 3) -> List[Dict[str, Any]]:
|
| 93 |
-
"""
|
| 94 |
-
Retrieve documents specifically for citation purposes
|
| 95 |
-
This method focuses on getting clean, citable content
|
| 96 |
-
"""
|
| 97 |
-
results = await self.retrieve_relevant_documents(query, top_k)
|
| 98 |
-
|
| 99 |
-
# Format results for citation
|
| 100 |
-
citations = []
|
| 101 |
-
for result in results:
|
| 102 |
-
citation = {
|
| 103 |
-
'document_id': result.get('id'),
|
| 104 |
-
'title': result.get('title'),
|
| 105 |
-
'chapter': result.get('chapter'),
|
| 106 |
-
'section': result.get('section'),
|
| 107 |
-
'page_reference': result.get('page_reference'),
|
| 108 |
-
'content': result.get('content'),
|
| 109 |
-
'relevance_score': result.get('score')
|
| 110 |
-
}
|
| 111 |
-
citations.append(citation)
|
| 112 |
-
|
| 113 |
-
return citations
|
| 114 |
-
|
| 115 |
-
def validate_retrieval_quality(self, results: List[Dict[str, Any]], query: str) -> Dict[str, Any]:
|
| 116 |
-
"""
|
| 117 |
-
Validate the quality of the retrieval results
|
| 118 |
-
"""
|
| 119 |
-
quality_metrics = {
|
| 120 |
-
'total_results': len(results),
|
| 121 |
-
'avg_relevance_score': sum(r.get('score', 0) for r in results) / len(results) if results else 0,
|
| 122 |
-
'has_high_quality_results': any(r.get('score', 0) > 0.7 for r in results) if results else False,
|
| 123 |
-
'query_coverage': self._assess_query_coverage(results, query)
|
| 124 |
-
}
|
| 125 |
-
|
| 126 |
-
return quality_metrics
|
| 127 |
-
|
| 128 |
-
def _assess_query_coverage(self, results: List[Dict[str, Any]], query: str) -> float:
|
| 129 |
-
"""
|
| 130 |
-
Assess how well the results cover the query topics
|
| 131 |
-
This is a simplified implementation
|
| 132 |
-
"""
|
| 133 |
-
if not results:
|
| 134 |
-
return 0.0
|
| 135 |
-
|
| 136 |
-
query_keywords = set(query.lower().split())
|
| 137 |
-
covered_keywords = set()
|
| 138 |
-
|
| 139 |
-
for result in results:
|
| 140 |
-
content = result.get('content', '').lower()
|
| 141 |
-
result_keywords = set(content.split())
|
| 142 |
-
covered_keywords.update(query_keywords.intersection(result_keywords))
|
| 143 |
-
|
| 144 |
-
coverage = len(covered_keywords) / len(query_keywords) if query_keywords else 0.0
|
| 145 |
-
return min(coverage, 1.0) # Ensure value is between 0 and 1
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
# Global instance
|
| 149 |
-
retriever = Retriever()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/retrieval/vector_search.py
DELETED
|
@@ -1,103 +0,0 @@
|
|
| 1 |
-
from typing import List, Dict, Any, Optional
|
| 2 |
-
from app.vector_store.qdrant_client import QdrantVectorStore
|
| 3 |
-
from app.embeddings.minimal_embedding_generator import minimal_embedding_generator
|
| 4 |
-
from app.config import settings
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
class VectorSearchEngine:
|
| 8 |
-
"""
|
| 9 |
-
Core vector search engine that handles semantic search operations
|
| 10 |
-
"""
|
| 11 |
-
|
| 12 |
-
def __init__(self):
|
| 13 |
-
self.qdrant_client = QdrantVectorStore()
|
| 14 |
-
self.top_k_default = 5
|
| 15 |
-
|
| 16 |
-
async def search_with_query(self, query: str, top_k: int = 5, filters: Optional[Dict] = None) -> List[Dict[str, Any]]:
|
| 17 |
-
"""
|
| 18 |
-
Perform semantic search using a query string
|
| 19 |
-
"""
|
| 20 |
-
# Generate embedding for the query using minimal generator
|
| 21 |
-
query_embedding = minimal_embedding_generator.encode_query(query)
|
| 22 |
-
|
| 23 |
-
if not query_embedding:
|
| 24 |
-
return []
|
| 25 |
-
|
| 26 |
-
# Perform vector search in Qdrant
|
| 27 |
-
chapter_filter = filters.get('chapter') if filters else None
|
| 28 |
-
search_results = self.qdrant_client.search_similar(
|
| 29 |
-
query_embedding=query_embedding,
|
| 30 |
-
top_k=top_k,
|
| 31 |
-
chapter_filter=chapter_filter
|
| 32 |
-
)
|
| 33 |
-
|
| 34 |
-
return search_results
|
| 35 |
-
|
| 36 |
-
async def search_with_embedding(self, query_embedding: List[float], top_k: int = 5, filters: Optional[Dict] = None) -> List[Dict[str, Any]]:
|
| 37 |
-
"""
|
| 38 |
-
Perform semantic search using a pre-computed embedding
|
| 39 |
-
"""
|
| 40 |
-
chapter_filter = filters.get('chapter') if filters else None
|
| 41 |
-
search_results = self.qdrant_client.search_similar(
|
| 42 |
-
query_embedding=query_embedding,
|
| 43 |
-
top_k=top_k,
|
| 44 |
-
chapter_filter=chapter_filter
|
| 45 |
-
)
|
| 46 |
-
|
| 47 |
-
return search_results
|
| 48 |
-
|
| 49 |
-
def rank_results_by_relevance(self, results: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
|
| 50 |
-
"""
|
| 51 |
-
Apply additional ranking based on relevance to the query
|
| 52 |
-
This is a simple implementation; in production, you might want to use more sophisticated ranking
|
| 53 |
-
"""
|
| 54 |
-
# For now, we'll just return the results as Qdrant already ranks by similarity score
|
| 55 |
-
# In the future, we could implement additional ranking based on:
|
| 56 |
-
# - keyword matching in title/content
|
| 57 |
-
# - recency of content
|
| 58 |
-
# - content length relative to query needs
|
| 59 |
-
return sorted(results, key=lambda x: x.get('score', 0), reverse=True)
|
| 60 |
-
|
| 61 |
-
def filter_results(self, results: List[Dict[str, Any]], filters: Optional[Dict] = None) -> List[Dict[str, Any]]:
|
| 62 |
-
"""
|
| 63 |
-
Apply additional filtering to search results
|
| 64 |
-
"""
|
| 65 |
-
if not filters:
|
| 66 |
-
return results
|
| 67 |
-
|
| 68 |
-
filtered_results = []
|
| 69 |
-
for result in results:
|
| 70 |
-
include = True
|
| 71 |
-
|
| 72 |
-
# Apply content-based filters
|
| 73 |
-
if 'min_score' in filters:
|
| 74 |
-
if result.get('score', 0) < filters['min_score']:
|
| 75 |
-
include = False
|
| 76 |
-
|
| 77 |
-
if 'required_keywords' in filters:
|
| 78 |
-
content = result.get('content', '').lower()
|
| 79 |
-
for keyword in filters['required_keywords']:
|
| 80 |
-
if keyword.lower() not in content:
|
| 81 |
-
include = False
|
| 82 |
-
break
|
| 83 |
-
|
| 84 |
-
if include:
|
| 85 |
-
filtered_results.append(result)
|
| 86 |
-
|
| 87 |
-
return filtered_results
|
| 88 |
-
|
| 89 |
-
async def get_document_content(self, doc_id: str) -> Optional[Dict[str, Any]]:
|
| 90 |
-
"""
|
| 91 |
-
Retrieve content of a specific document by ID
|
| 92 |
-
"""
|
| 93 |
-
return self.qdrant_client.get_document_by_id(doc_id)
|
| 94 |
-
|
| 95 |
-
def get_collection_stats(self) -> Dict[str, Any]:
|
| 96 |
-
"""
|
| 97 |
-
Get statistics about the vector collection
|
| 98 |
-
"""
|
| 99 |
-
return self.qdrant_client.get_collection_info()
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
# Global instance
|
| 103 |
-
vector_search_engine = VectorSearchEngine()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/services/__pycache__/chat_service.cpython-312.pyc
DELETED
|
Binary file (5.69 kB)
|
|
|
app/services/__pycache__/gemini_client.cpython-312.pyc
DELETED
|
Binary file (6.58 kB)
|
|
|
app/services/__pycache__/openrouter_client.cpython-312.pyc
DELETED
|
Binary file (8.86 kB)
|
|
|
app/services/chat_service.py
DELETED
|
@@ -1,144 +0,0 @@
|
|
| 1 |
-
from typing import Dict, Any, List, Optional
|
| 2 |
-
from datetime import datetime
|
| 3 |
-
import uuid
|
| 4 |
-
|
| 5 |
-
from app.retrieval.retriever import Retriever
|
| 6 |
-
from app.prompting.context_filter import ContextFilter
|
| 7 |
-
from app.generation.response_generator import ResponseGenerator
|
| 8 |
-
from app.database.repositories import ChatSessionRepository, ChatMessageRepository, QueryContextRepository
|
| 9 |
-
from app.database.database import get_db
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
class ChatService:
|
| 13 |
-
"""
|
| 14 |
-
Service class that orchestrates the chat functionality
|
| 15 |
-
"""
|
| 16 |
-
|
| 17 |
-
def __init__(self):
|
| 18 |
-
self.retriever = Retriever()
|
| 19 |
-
self.context_filter = ContextFilter()
|
| 20 |
-
self.response_generator = ResponseGenerator()
|
| 21 |
-
|
| 22 |
-
async def process_chat_message(
|
| 23 |
-
self,
|
| 24 |
-
session_id: str,
|
| 25 |
-
message: str,
|
| 26 |
-
query_type: str = "global",
|
| 27 |
-
selected_text: Optional[str] = None,
|
| 28 |
-
top_k: int = 5
|
| 29 |
-
) -> Dict[str, Any]:
|
| 30 |
-
"""
|
| 31 |
-
Process a chat message through the full RAG pipeline
|
| 32 |
-
"""
|
| 33 |
-
# Retrieve relevant documents
|
| 34 |
-
retrieved_docs = await self.retriever.retrieve_with_context_filtering(
|
| 35 |
-
query=message,
|
| 36 |
-
top_k=top_k,
|
| 37 |
-
query_type=query_type,
|
| 38 |
-
selected_text=selected_text
|
| 39 |
-
)
|
| 40 |
-
|
| 41 |
-
# Apply context filtering to ensure proper isolation
|
| 42 |
-
filtered_docs = self.context_filter.enforce_context_boundaries(
|
| 43 |
-
contexts=retrieved_docs,
|
| 44 |
-
query_type=query_type,
|
| 45 |
-
selected_text=selected_text
|
| 46 |
-
)
|
| 47 |
-
|
| 48 |
-
# Generate response
|
| 49 |
-
response_data = await self.response_generator.generate_response_with_validation(
|
| 50 |
-
query=message,
|
| 51 |
-
retrieved_contexts=filtered_docs,
|
| 52 |
-
query_type=query_type,
|
| 53 |
-
selected_text=selected_text,
|
| 54 |
-
session_id=session_id
|
| 55 |
-
)
|
| 56 |
-
|
| 57 |
-
# Store conversation in database
|
| 58 |
-
await self._store_conversation(session_id, message, response_data)
|
| 59 |
-
|
| 60 |
-
return response_data
|
| 61 |
-
|
| 62 |
-
async def _store_conversation(self, session_id: str, user_message: str, response_data: Dict[str, Any]):
|
| 63 |
-
"""
|
| 64 |
-
Store the conversation in the database
|
| 65 |
-
"""
|
| 66 |
-
db_gen = get_db()
|
| 67 |
-
db = next(db_gen)
|
| 68 |
-
try:
|
| 69 |
-
# Create or update session
|
| 70 |
-
session_repo = ChatSessionRepository(db)
|
| 71 |
-
existing_session = session_repo.get_session_by_id(session_id)
|
| 72 |
-
if not existing_session:
|
| 73 |
-
session_repo.create_session(session_id=session_id)
|
| 74 |
-
|
| 75 |
-
# Store user message
|
| 76 |
-
user_message_id = f"msg_{uuid.uuid4().hex[:8]}"
|
| 77 |
-
message_repo = ChatMessageRepository(db)
|
| 78 |
-
message_repo.create_message(
|
| 79 |
-
message_id=user_message_id,
|
| 80 |
-
session_id=session_id,
|
| 81 |
-
role="user",
|
| 82 |
-
content=user_message
|
| 83 |
-
)
|
| 84 |
-
|
| 85 |
-
# Store assistant response
|
| 86 |
-
assistant_message_id = f"msg_{uuid.uuid4().hex[:8]}"
|
| 87 |
-
citations_for_storage = response_data.get("citations", [])
|
| 88 |
-
message_repo.create_message(
|
| 89 |
-
message_id=assistant_message_id,
|
| 90 |
-
session_id=session_id,
|
| 91 |
-
role="assistant",
|
| 92 |
-
content=response_data.get("response", ""),
|
| 93 |
-
citations=citations_for_storage
|
| 94 |
-
)
|
| 95 |
-
finally:
|
| 96 |
-
next(db_gen, None) # Close the db session
|
| 97 |
-
|
| 98 |
-
async def get_chat_history(self, session_id: str) -> List[Dict[str, Any]]:
|
| 99 |
-
"""
|
| 100 |
-
Retrieve chat history for a session
|
| 101 |
-
"""
|
| 102 |
-
db_gen = get_db()
|
| 103 |
-
db = next(db_gen)
|
| 104 |
-
try:
|
| 105 |
-
message_repo = ChatMessageRepository(db)
|
| 106 |
-
messages = message_repo.get_messages_by_session(session_id)
|
| 107 |
-
|
| 108 |
-
return [
|
| 109 |
-
{
|
| 110 |
-
"message_id": msg.message_id,
|
| 111 |
-
"role": msg.role,
|
| 112 |
-
"content": msg.content,
|
| 113 |
-
"timestamp": msg.timestamp.isoformat() if msg.timestamp else None,
|
| 114 |
-
"citations": msg.citations
|
| 115 |
-
}
|
| 116 |
-
for msg in messages
|
| 117 |
-
]
|
| 118 |
-
finally:
|
| 119 |
-
next(db_gen, None)
|
| 120 |
-
|
| 121 |
-
def validate_query_params(
|
| 122 |
-
self,
|
| 123 |
-
query_type: str,
|
| 124 |
-
selected_text: Optional[str] = None
|
| 125 |
-
) -> Dict[str, Any]:
|
| 126 |
-
"""
|
| 127 |
-
Validate query parameters
|
| 128 |
-
"""
|
| 129 |
-
errors = []
|
| 130 |
-
|
| 131 |
-
if query_type not in ["global", "selection"]:
|
| 132 |
-
errors.append("query_type must be either 'global' or 'selection'")
|
| 133 |
-
|
| 134 |
-
if query_type == "selection" and not selected_text:
|
| 135 |
-
errors.append("selected_text is required for selection-based queries")
|
| 136 |
-
|
| 137 |
-
return {
|
| 138 |
-
"is_valid": len(errors) == 0,
|
| 139 |
-
"errors": errors
|
| 140 |
-
}
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
# Global instance
|
| 144 |
-
chat_service = ChatService()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/services/openrouter_client.py
DELETED
|
@@ -1,165 +0,0 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
-
import httpx
|
| 3 |
-
from typing import List, Dict, Any, Optional
|
| 4 |
-
from app.config import settings
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
class OpenRouterClient:
|
| 8 |
-
"""
|
| 9 |
-
Client for interacting with OpenRouter API for both embeddings and chat completions
|
| 10 |
-
"""
|
| 11 |
-
|
| 12 |
-
def __init__(self):
|
| 13 |
-
self.base_url = settings.OPENROUTER_BASE_URL
|
| 14 |
-
self.api_key = settings.OPENROUTER_API_KEY
|
| 15 |
-
self.max_retries = 3
|
| 16 |
-
self.retry_delay = 1
|
| 17 |
-
|
| 18 |
-
async def generate_embeddings(self, texts: List[str], model: str = "text-embedding-ada-002") -> List[List[float]]:
|
| 19 |
-
"""
|
| 20 |
-
Generate embeddings for a list of texts using OpenRouter API
|
| 21 |
-
"""
|
| 22 |
-
headers = {
|
| 23 |
-
"Authorization": f"Bearer {self.api_key}",
|
| 24 |
-
"Content-Type": "application/json"
|
| 25 |
-
}
|
| 26 |
-
|
| 27 |
-
embeddings = []
|
| 28 |
-
for text in texts:
|
| 29 |
-
# Truncate text if it's too long
|
| 30 |
-
max_length = 8000 # Conservative limit
|
| 31 |
-
if len(text) > max_length:
|
| 32 |
-
text = text[:max_length]
|
| 33 |
-
|
| 34 |
-
data = {
|
| 35 |
-
"model": model,
|
| 36 |
-
"input": text
|
| 37 |
-
}
|
| 38 |
-
|
| 39 |
-
async with httpx.AsyncClient(timeout=30.0) as client:
|
| 40 |
-
for attempt in range(self.max_retries):
|
| 41 |
-
try:
|
| 42 |
-
response = await client.post(
|
| 43 |
-
f"{self.base_url}/embeddings",
|
| 44 |
-
headers=headers,
|
| 45 |
-
json=data
|
| 46 |
-
)
|
| 47 |
-
|
| 48 |
-
if response.status_code == 200:
|
| 49 |
-
result = response.json()
|
| 50 |
-
embedding = result['data'][0]['embedding']
|
| 51 |
-
embeddings.append(embedding)
|
| 52 |
-
break
|
| 53 |
-
elif response.status_code == 429:
|
| 54 |
-
# Rate limited - wait and retry
|
| 55 |
-
wait_time = self.retry_delay * (2 ** attempt) # Exponential backoff
|
| 56 |
-
print(f"Rate limited, waiting {wait_time}s before retry {attempt + 1}")
|
| 57 |
-
await asyncio.sleep(wait_time)
|
| 58 |
-
continue
|
| 59 |
-
else:
|
| 60 |
-
print(f"Error {response.status_code}: {response.text}")
|
| 61 |
-
if attempt == self.max_retries - 1:
|
| 62 |
-
# Last attempt, return zeros as fallback
|
| 63 |
-
embeddings.append([0.0] * 1536)
|
| 64 |
-
break
|
| 65 |
-
|
| 66 |
-
except httpx.RequestError as e:
|
| 67 |
-
print(f"Request error on attempt {attempt + 1}: {str(e)}")
|
| 68 |
-
if attempt == self.max_retries - 1:
|
| 69 |
-
embeddings.append([0.0] * 1536)
|
| 70 |
-
await asyncio.sleep(self.retry_delay * (2 ** attempt))
|
| 71 |
-
except Exception as e:
|
| 72 |
-
print(f"Unexpected error on attempt {attempt + 1}: {str(e)}")
|
| 73 |
-
if attempt == self.max_retries - 1:
|
| 74 |
-
embeddings.append([0.0] * 1536)
|
| 75 |
-
await asyncio.sleep(self.retry_delay * (2 ** attempt))
|
| 76 |
-
|
| 77 |
-
return embeddings
|
| 78 |
-
|
| 79 |
-
async def generate_completion(
|
| 80 |
-
self,
|
| 81 |
-
messages: List[Dict[str, str]],
|
| 82 |
-
model: str = "mistralai/devstral-2512:free",
|
| 83 |
-
temperature: float = 0.7,
|
| 84 |
-
max_tokens: int = 1000
|
| 85 |
-
) -> Optional[str]:
|
| 86 |
-
"""
|
| 87 |
-
Generate completion using OpenRouter API with specified model
|
| 88 |
-
"""
|
| 89 |
-
headers = {
|
| 90 |
-
"Authorization": f"Bearer {self.api_key}",
|
| 91 |
-
"Content-Type": "application/json"
|
| 92 |
-
}
|
| 93 |
-
|
| 94 |
-
data = {
|
| 95 |
-
"model": model,
|
| 96 |
-
"messages": messages,
|
| 97 |
-
"temperature": temperature,
|
| 98 |
-
"max_tokens": max_tokens
|
| 99 |
-
}
|
| 100 |
-
|
| 101 |
-
async with httpx.AsyncClient(timeout=60.0) as client:
|
| 102 |
-
for attempt in range(self.max_retries):
|
| 103 |
-
try:
|
| 104 |
-
response = await client.post(
|
| 105 |
-
f"{self.base_url}/chat/completions",
|
| 106 |
-
headers=headers,
|
| 107 |
-
json=data
|
| 108 |
-
)
|
| 109 |
-
|
| 110 |
-
if response.status_code == 200:
|
| 111 |
-
result = response.json()
|
| 112 |
-
return result['choices'][0]['message']['content']
|
| 113 |
-
elif response.status_code == 429:
|
| 114 |
-
# Rate limited - wait and retry
|
| 115 |
-
wait_time = self.retry_delay * (2 ** attempt) # Exponential backoff
|
| 116 |
-
print(f"Rate limited, waiting {wait_time}s before retry {attempt + 1}")
|
| 117 |
-
await asyncio.sleep(wait_time)
|
| 118 |
-
continue
|
| 119 |
-
else:
|
| 120 |
-
print(f"Error {response.status_code}: {response.text}")
|
| 121 |
-
if attempt == self.max_retries - 1:
|
| 122 |
-
return None
|
| 123 |
-
break
|
| 124 |
-
|
| 125 |
-
except httpx.RequestError as e:
|
| 126 |
-
print(f"Request error on attempt {attempt + 1}: {str(e)}")
|
| 127 |
-
if attempt == self.max_retries - 1:
|
| 128 |
-
return None
|
| 129 |
-
await asyncio.sleep(self.retry_delay * (2 ** attempt))
|
| 130 |
-
except Exception as e:
|
| 131 |
-
print(f"Unexpected error on attempt {attempt + 1}: {str(e)}")
|
| 132 |
-
if attempt == self.max_retries - 1:
|
| 133 |
-
return None
|
| 134 |
-
await asyncio.sleep(self.retry_delay * (2 ** attempt))
|
| 135 |
-
|
| 136 |
-
return None
|
| 137 |
-
|
| 138 |
-
async def get_model_info(self, model: str) -> Optional[Dict[str, Any]]:
|
| 139 |
-
"""
|
| 140 |
-
Get information about a specific model
|
| 141 |
-
"""
|
| 142 |
-
headers = {
|
| 143 |
-
"Authorization": f"Bearer {self.api_key}",
|
| 144 |
-
"Content-Type": "application/json"
|
| 145 |
-
}
|
| 146 |
-
|
| 147 |
-
try:
|
| 148 |
-
async with httpx.AsyncClient(timeout=30.0) as client:
|
| 149 |
-
response = await client.get(
|
| 150 |
-
f"{self.base_url}/models/{model}",
|
| 151 |
-
headers=headers
|
| 152 |
-
)
|
| 153 |
-
|
| 154 |
-
if response.status_code == 200:
|
| 155 |
-
return response.json()
|
| 156 |
-
else:
|
| 157 |
-
print(f"Error getting model info {response.status_code}: {response.text}")
|
| 158 |
-
return None
|
| 159 |
-
except Exception as e:
|
| 160 |
-
print(f"Error getting model info: {str(e)}")
|
| 161 |
-
return None
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
# Global client instance
|
| 165 |
-
openrouter_client = OpenRouterClient()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/vector_store/__pycache__/qdrant_client.cpython-312.pyc
DELETED
|
Binary file (8.25 kB)
|
|
|
app/vector_store/__pycache__/vector_repository.cpython-312.pyc
DELETED
|
Binary file (2.59 kB)
|
|
|
app/vector_store/qdrant_client.py
DELETED
|
@@ -1,207 +0,0 @@
|
|
| 1 |
-
from typing import List, Dict, Any, Optional
|
| 2 |
-
from qdrant_client import QdrantClient
|
| 3 |
-
from qdrant_client.http import models
|
| 4 |
-
from app.config import settings
|
| 5 |
-
from app.ingestion.chunker import TextChunk
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class QdrantVectorStore:
|
| 9 |
-
"""
|
| 10 |
-
Qdrant vector database client for storing and retrieving embeddings
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
def __init__(self):
|
| 14 |
-
try:
|
| 15 |
-
self.client = QdrantClient(
|
| 16 |
-
url=settings.QDRANT_URL,
|
| 17 |
-
api_key=settings.QDRANT_API_KEY,
|
| 18 |
-
prefer_grpc=False, # Using HTTP for better compatibility
|
| 19 |
-
timeout=60.0 # Increase timeout for large batch operations
|
| 20 |
-
)
|
| 21 |
-
self.collection_name = "book_content_chunks"
|
| 22 |
-
self.vector_size = 1536 # Standard embedding size for text-embedding-ada-002
|
| 23 |
-
self._initialize_collection()
|
| 24 |
-
except Exception as e:
|
| 25 |
-
print(f"[WARN] Could not connect to Qdrant: {e}")
|
| 26 |
-
print("[WARN] Qdrant functionality will be unavailable until connection is restored")
|
| 27 |
-
# Initialize with None values when connection fails
|
| 28 |
-
self.client = None
|
| 29 |
-
self.collection_name = "book_content_chunks"
|
| 30 |
-
self.vector_size = 1536
|
| 31 |
-
|
| 32 |
-
def _initialize_collection(self):
|
| 33 |
-
"""
|
| 34 |
-
Initialize the collection if it doesn't exist
|
| 35 |
-
"""
|
| 36 |
-
if self.client is None:
|
| 37 |
-
return # Skip initialization if no client
|
| 38 |
-
|
| 39 |
-
try:
|
| 40 |
-
# Check if collection exists
|
| 41 |
-
self.client.get_collection(self.collection_name)
|
| 42 |
-
print(f"[INFO] Collection '{self.collection_name}' already exists")
|
| 43 |
-
except:
|
| 44 |
-
# Create collection if it doesn't exist
|
| 45 |
-
self.client.create_collection(
|
| 46 |
-
collection_name=self.collection_name,
|
| 47 |
-
vectors_config=models.VectorParams(
|
| 48 |
-
size=self.vector_size,
|
| 49 |
-
distance=models.Distance.COSINE
|
| 50 |
-
)
|
| 51 |
-
)
|
| 52 |
-
print(f"[INFO] Created collection '{self.collection_name}'")
|
| 53 |
-
|
| 54 |
-
# Create payload index for faster filtering
|
| 55 |
-
self.client.create_payload_index(
|
| 56 |
-
collection_name=self.collection_name,
|
| 57 |
-
field_name="chapter",
|
| 58 |
-
field_schema=models.PayloadSchemaType.KEYWORD
|
| 59 |
-
)
|
| 60 |
-
|
| 61 |
-
self.client.create_payload_index(
|
| 62 |
-
collection_name=self.collection_name,
|
| 63 |
-
field_name="section",
|
| 64 |
-
field_schema=models.PayloadSchemaType.KEYWORD
|
| 65 |
-
)
|
| 66 |
-
|
| 67 |
-
def store_embeddings(self, chunks_with_embeddings: List[Dict[str, Any]]):
|
| 68 |
-
"""
|
| 69 |
-
Store chunks with their embeddings in Qdrant
|
| 70 |
-
"""
|
| 71 |
-
if self.client is None:
|
| 72 |
-
print("[WARN] Cannot store embeddings - Qdrant not connected")
|
| 73 |
-
return
|
| 74 |
-
|
| 75 |
-
points = []
|
| 76 |
-
for item in chunks_with_embeddings:
|
| 77 |
-
point = models.PointStruct(
|
| 78 |
-
id=item['id'],
|
| 79 |
-
vector=item['embedding'],
|
| 80 |
-
payload={
|
| 81 |
-
'content': item['content'],
|
| 82 |
-
'title': item['title'],
|
| 83 |
-
'chapter': item['chapter'],
|
| 84 |
-
'section': item['section'],
|
| 85 |
-
'page_reference': item['page_reference'],
|
| 86 |
-
'token_count': item['token_count']
|
| 87 |
-
}
|
| 88 |
-
)
|
| 89 |
-
points.append(point)
|
| 90 |
-
|
| 91 |
-
# Upload points in smaller batches to avoid timeouts
|
| 92 |
-
batch_size = 16 # Smaller batch size to avoid timeouts
|
| 93 |
-
for i in range(0, len(points), batch_size):
|
| 94 |
-
batch = points[i:i + batch_size]
|
| 95 |
-
self.client.upsert(
|
| 96 |
-
collection_name=self.collection_name,
|
| 97 |
-
points=batch
|
| 98 |
-
)
|
| 99 |
-
# Add a small delay between batches to avoid overwhelming the server
|
| 100 |
-
import time
|
| 101 |
-
time.sleep(0.1)
|
| 102 |
-
|
| 103 |
-
def search_similar(self, query_embedding: List[float], top_k: int = 5, chapter_filter: Optional[str] = None) -> List[Dict[str, Any]]:
|
| 104 |
-
"""
|
| 105 |
-
Search for similar content based on embedding similarity
|
| 106 |
-
"""
|
| 107 |
-
if self.client is None:
|
| 108 |
-
print("[WARN] Cannot search - Qdrant not connected")
|
| 109 |
-
return []
|
| 110 |
-
|
| 111 |
-
# Build filters if needed
|
| 112 |
-
filters = None
|
| 113 |
-
if chapter_filter:
|
| 114 |
-
filters = models.Filter(
|
| 115 |
-
must=[
|
| 116 |
-
models.FieldCondition(
|
| 117 |
-
key="chapter",
|
| 118 |
-
match=models.MatchValue(value=chapter_filter)
|
| 119 |
-
)
|
| 120 |
-
]
|
| 121 |
-
)
|
| 122 |
-
|
| 123 |
-
# Perform search
|
| 124 |
-
search_results = self.client.search(
|
| 125 |
-
collection_name=self.collection_name,
|
| 126 |
-
query_vector=query_embedding,
|
| 127 |
-
query_filter=filters,
|
| 128 |
-
limit=top_k,
|
| 129 |
-
with_payload=True
|
| 130 |
-
)
|
| 131 |
-
|
| 132 |
-
# Format results
|
| 133 |
-
results = []
|
| 134 |
-
for result in search_results:
|
| 135 |
-
results.append({
|
| 136 |
-
'id': result.id,
|
| 137 |
-
'content': result.payload['content'],
|
| 138 |
-
'title': result.payload['title'],
|
| 139 |
-
'chapter': result.payload['chapter'],
|
| 140 |
-
'section': result.payload['section'],
|
| 141 |
-
'page_reference': result.payload['page_reference'],
|
| 142 |
-
'score': result.score
|
| 143 |
-
})
|
| 144 |
-
|
| 145 |
-
return results
|
| 146 |
-
|
| 147 |
-
def get_document_by_id(self, doc_id: str) -> Optional[Dict[str, Any]]:
|
| 148 |
-
"""
|
| 149 |
-
Retrieve a specific document by its ID
|
| 150 |
-
"""
|
| 151 |
-
if self.client is None:
|
| 152 |
-
print("[WARN] Cannot retrieve document - Qdrant not connected")
|
| 153 |
-
return None
|
| 154 |
-
|
| 155 |
-
points = self.client.retrieve(
|
| 156 |
-
collection_name=self.collection_name,
|
| 157 |
-
ids=[doc_id],
|
| 158 |
-
with_payload=True
|
| 159 |
-
)
|
| 160 |
-
|
| 161 |
-
if points:
|
| 162 |
-
point = points[0]
|
| 163 |
-
return {
|
| 164 |
-
'id': point.id,
|
| 165 |
-
'content': point.payload['content'],
|
| 166 |
-
'title': point.payload['title'],
|
| 167 |
-
'chapter': point.payload['chapter'],
|
| 168 |
-
'section': point.payload['section'],
|
| 169 |
-
'page_reference': point.payload['page_reference']
|
| 170 |
-
}
|
| 171 |
-
|
| 172 |
-
return None
|
| 173 |
-
|
| 174 |
-
def delete_collection(self):
|
| 175 |
-
"""
|
| 176 |
-
Delete the entire collection (use with caution!)
|
| 177 |
-
"""
|
| 178 |
-
if self.client is None:
|
| 179 |
-
print("[WARN] Cannot delete collection - Qdrant not connected")
|
| 180 |
-
return
|
| 181 |
-
|
| 182 |
-
self.client.delete_collection(self.collection_name)
|
| 183 |
-
|
| 184 |
-
def get_collection_info(self) -> Dict[str, Any]:
|
| 185 |
-
"""
|
| 186 |
-
Get information about the collection
|
| 187 |
-
"""
|
| 188 |
-
if self.client is None:
|
| 189 |
-
print("[WARN] Cannot get collection info - Qdrant not connected")
|
| 190 |
-
return {
|
| 191 |
-
'name': self.collection_name,
|
| 192 |
-
'vector_size': self.vector_size,
|
| 193 |
-
'distance': 'COSINE',
|
| 194 |
-
'point_count': 0
|
| 195 |
-
}
|
| 196 |
-
|
| 197 |
-
info = self.client.get_collection(self.collection_name)
|
| 198 |
-
return {
|
| 199 |
-
'name': self.collection_name,
|
| 200 |
-
'vector_size': info.config.params.vectors.size,
|
| 201 |
-
'distance': info.config.params.vectors.distance,
|
| 202 |
-
'point_count': info.points_count
|
| 203 |
-
}
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
# Global instance
|
| 207 |
-
qdrant_client = QdrantVectorStore()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/vector_store/vector_repository.py
DELETED
|
@@ -1,49 +0,0 @@
|
|
| 1 |
-
from typing import List, Dict, Any, Optional
|
| 2 |
-
from app.vector_store.qdrant_client import QdrantVectorStore, qdrant_client
|
| 3 |
-
from app.ingestion.chunker import TextChunk
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class VectorRepository:
|
| 7 |
-
"""
|
| 8 |
-
Repository class for vector store operations
|
| 9 |
-
"""
|
| 10 |
-
|
| 11 |
-
def __init__(self, vector_store: QdrantVectorStore):
|
| 12 |
-
self.vector_store = vector_store
|
| 13 |
-
|
| 14 |
-
def store_document_chunks(self, chunks_with_embeddings: List[Dict[str, Any]]):
|
| 15 |
-
"""
|
| 16 |
-
Store document chunks with embeddings in the vector store
|
| 17 |
-
"""
|
| 18 |
-
self.vector_store.store_embeddings(chunks_with_embeddings)
|
| 19 |
-
|
| 20 |
-
def search_relevant_chunks(
|
| 21 |
-
self,
|
| 22 |
-
query_embedding: List[float],
|
| 23 |
-
top_k: int = 5,
|
| 24 |
-
chapter_filter: Optional[str] = None
|
| 25 |
-
) -> List[Dict[str, Any]]:
|
| 26 |
-
"""
|
| 27 |
-
Search for relevant chunks based on query embedding
|
| 28 |
-
"""
|
| 29 |
-
return self.vector_store.search_similar(
|
| 30 |
-
query_embedding=query_embedding,
|
| 31 |
-
top_k=top_k,
|
| 32 |
-
chapter_filter=chapter_filter
|
| 33 |
-
)
|
| 34 |
-
|
| 35 |
-
def get_document_by_id(self, doc_id: str) -> Optional[Dict[str, Any]]:
|
| 36 |
-
"""
|
| 37 |
-
Retrieve a document by its ID
|
| 38 |
-
"""
|
| 39 |
-
return self.vector_store.get_document_by_id(doc_id)
|
| 40 |
-
|
| 41 |
-
def get_collection_stats(self) -> Dict[str, Any]:
|
| 42 |
-
"""
|
| 43 |
-
Get statistics about the vector collection
|
| 44 |
-
"""
|
| 45 |
-
return self.vector_store.get_collection_info()
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
# Global instance
|
| 49 |
-
vector_repository = VectorRepository(qdrant_client)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|