# rag_optimized.py — NOTE(review): the original paste carried blob-viewer
# residue here (a "File size: 14,747 Bytes" banner, a git-blame commit-hash
# gutter, and a 1..354 line-number gutter). That residue is not Python and
# broke parsing; it has been reduced to this comment.
# rag_optimized.py - Performance-Optimized RAG System
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from typing import List, Dict, Any, Tuple, Optional
import faiss
import hashlib
from tqdm import tqdm
from groq import Groq
import re
import nltk
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from collections import defaultdict
import spacy
from rank_bm25 import BM25Okapi
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor
import logging
# Configure logging
logger = logging.getLogger(__name__)

# Global model instances (shared across sessions).
# All five are populated exactly once by initialize_models(); every
# OptimizedSessionRAG instance reads them but never writes them.
_SHARED_MODEL = None      # transformers AutoModel used for embeddings
_SHARED_TOKENIZER = None  # matching AutoTokenizer
_SHARED_NLP_MODEL = None  # spaCy pipeline, or None when en_core_web_sm is absent
_DEVICE = None            # torch.device ('cuda' when available, else 'cpu')
_THREAD_POOL = None       # ThreadPoolExecutor(max_workers=4) for CPU-bound work
# Legal knowledge base (optimized).
# Maps a broad legal area to the doctrinal terms that analyze_query_fast()
# matches verbatim (lowercase substring) inside user queries; matched terms
# are appended to the query as a lightweight expansion.
LEGAL_CONCEPTS = {
    'liability': ['negligence', 'strict liability', 'vicarious liability', 'product liability'],
    'contract': ['breach', 'consideration', 'offer', 'acceptance', 'damages', 'specific performance'],
    'criminal': ['mens rea', 'actus reus', 'intent', 'malice', 'premeditation'],
    'procedure': ['jurisdiction', 'standing', 'statute of limitations', 'res judicata'],
    'evidence': ['hearsay', 'relevance', 'privilege', 'burden of proof', 'admissibility'],
    'constitutional': ['due process', 'equal protection', 'free speech', 'search and seizure']
}

# Substring patterns for quick query-type classification; the first category
# with any matching pattern wins, otherwise the type defaults to 'general'.
QUERY_PATTERNS = {
    'precedent': ['case', 'precedent', 'ruling', 'held', 'decision'],
    'statute_interpretation': ['statute', 'section', 'interpretation', 'meaning', 'definition'],
    'factual': ['what happened', 'facts', 'circumstances', 'events'],
    'procedure': ['how to', 'procedure', 'process', 'filing', 'requirements']
}
def initialize_models(model_id: str, groq_api_key: str = None):
    """Initialize the shared embedding model, tokenizer, spaCy pipeline and thread pool.

    Must be called once at application startup, before any OptimizedSessionRAG
    is constructed (its __init__ raises ValueError otherwise).

    Args:
        model_id: Hugging Face model identifier for the embedding model.
        groq_api_key: Unused here; kept for backward compatibility with
            existing callers (each OptimizedSessionRAG takes its own key).
    """
    global _SHARED_MODEL, _SHARED_TOKENIZER, _SHARED_NLP_MODEL, _DEVICE, _THREAD_POOL

    # Best-effort NLTK data download; failure (e.g. offline host) is non-fatal,
    # but is now logged instead of silently discarded. `except Exception`
    # replaces the original bare `except:` so SystemExit/KeyboardInterrupt
    # are no longer swallowed.
    try:
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
    except Exception as exc:
        logger.warning("NLTK data download failed (continuing): %s", exc)

    _DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Using device: {_DEVICE}")
    logger.info(f"Loading model: {model_id}")
    _SHARED_TOKENIZER = AutoTokenizer.from_pretrained(model_id)
    _SHARED_MODEL = AutoModel.from_pretrained(model_id).to(_DEVICE)
    _SHARED_MODEL.eval()  # inference mode: disables dropout etc.

    # Initialize thread pool for CPU-bound operations
    _THREAD_POOL = ThreadPoolExecutor(max_workers=4)

    try:
        _SHARED_NLP_MODEL = spacy.load("en_core_web_sm")
    except Exception:  # model package not installed — degrade gracefully
        logger.warning("SpaCy model not found, using basic NER")
        _SHARED_NLP_MODEL = None
class OptimizedSessionRAG:
    """High-performance, session-scoped RAG instance built on pre-computed embeddings.

    Chunks (with embeddings already stored, e.g. in MongoDB) are loaded as-is
    and only the search indices are rebuilt: a FAISS inner-product index, a
    BM25 index, and a token->chunk-index map. Embeddings are computed solely
    for incoming queries, never for documents.

    Requires initialize_models() to have populated the shared model globals.
    """

    def __init__(self, session_id: str, groq_api_key: str = None):
        """Create a per-session instance.

        Args:
            session_id: Identifier used in log and error messages.
            groq_api_key: Optional Groq API key; without it, answer
                generation returns an error dict instead of calling the LLM.

        Raises:
            ValueError: If initialize_models() was never called.
        """
        self.session_id = session_id
        self.groq_client = Groq(api_key=groq_api_key) if groq_api_key else None
        # Session-specific indices and data
        self.dense_index = None      # faiss.IndexFlatIP over chunk embeddings
        self.bm25_index = None       # BM25Okapi over whitespace-tokenized text
        self.token_to_chunks = None  # token -> set of chunk indices
        self.chunks_data = []        # normalized chunk dicts (see _process_db_chunks_fast)
        # Performance tracking
        self.load_time = None
        self.index_build_time = None
        # Fail fast if the shared models were never initialized.
        if _SHARED_MODEL is None or _SHARED_TOKENIZER is None:
            raise ValueError("Models not initialized. Call initialize_models() first.")

    def load_existing_session_data(self, chunks_from_db: List[Dict[str, Any]]):
        """Load pre-existing chunks (with embeddings) from the database.

        No embedding computation happens here — chunks are normalized and the
        indices are rebuilt directly from the stored vectors.
        """
        start_time = time.time()
        logger.info(f"Loading existing session data for {self.session_id}: {len(chunks_from_db)} chunks...")
        # Process chunks from MongoDB format - DIRECT LOADING, NO EMBEDDING COMPUTATION
        self.chunks_data = self._process_db_chunks_fast(chunks_from_db)
        # Rebuild indices from existing embeddings ONLY
        self._rebuild_indices_from_precomputed_embeddings()
        self.load_time = time.time() - start_time
        logger.info(f"Session {self.session_id} loaded in {self.load_time:.2f}s with PRE-COMPUTED embeddings!")

    def _process_db_chunks_fast(self, chunks_from_db: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Convert MongoDB chunk documents to the internal format without computation.

        Raises:
            ValueError: If any chunk lacks a stored embedding.
        """
        processed_chunks = []
        for chunk in chunks_from_db:
            # Convert embedding from list to numpy array if needed - NO COMPUTATION
            embedding = chunk.get('embedding')
            if embedding is None:
                raise ValueError(f"Missing embedding for chunk {chunk.get('chunk_id', 'unknown')}")
            if isinstance(embedding, list):
                embedding = np.array(embedding, dtype=np.float32)
            processed_chunk = {
                'id': chunk.get('chunk_id', chunk.get('id')),
                'text': chunk.get('content', chunk.get('text', '')),
                'title': chunk.get('title', 'Document'),
                'section_type': chunk.get('section_type', 'general'),
                'importance_score': chunk.get('importance_score', 1.0),
                'entities': chunk.get('entities', []),
                'embedding': embedding  # PRE-COMPUTED, NO CREATION
            }
            processed_chunks.append(processed_chunk)
        return processed_chunks

    def _rebuild_indices_from_precomputed_embeddings(self):
        """Rebuild all search indices from the already-loaded chunk embeddings.

        Raises:
            ValueError: If no chunks are loaded, or a chunk has a None embedding.
        """
        if not self.chunks_data:
            raise ValueError("No chunks data available")
        start_time = time.time()
        logger.info(f"Rebuilding indices from {len(self.chunks_data)} pre-computed embeddings...")

        # 1. Build FAISS index from existing embeddings - NO EMBEDDING COMPUTATION
        embeddings = []
        for chunk in self.chunks_data:
            if chunk['embedding'] is None:
                raise ValueError(f"Missing embedding for chunk {chunk.get('id', 'unknown')}")
            embeddings.append(chunk['embedding'])
        # Stack embeddings efficiently
        embeddings_matrix = np.vstack(embeddings).astype('float32')
        logger.info(f"Built embeddings matrix: {embeddings_matrix.shape}")
        # Inner-product index; create_embedding() L2-normalizes queries, so IP
        # acts as cosine similarity (assumes stored vectors are normalized too
        # — TODO confirm against the ingestion pipeline).
        self.dense_index = faiss.IndexFlatIP(embeddings_matrix.shape[1])
        self.dense_index.add(embeddings_matrix)

        # 2. Build BM25 index efficiently
        tokenized_corpus = [chunk['text'].lower().split() for chunk in self.chunks_data]
        self.bm25_index = BM25Okapi(tokenized_corpus)

        # 3. Build token-to-chunk mapping efficiently
        self.token_to_chunks = defaultdict(set)
        for i, chunk in enumerate(self.chunks_data):
            tokens = chunk['text'].lower().split()
            for token in tokens:
                self.token_to_chunks[token].add(i)

        self.index_build_time = time.time() - start_time
        logger.info(f"All indices rebuilt in {self.index_build_time:.2f}s from pre-computed embeddings!")

    def create_embedding(self, text: str) -> np.ndarray:
        """Embed a query string (used ONLY for queries, never for documents).

        Mean-pools the last hidden state over non-padding tokens, then
        L2-normalizes, returning a float32 1-D vector.
        """
        inputs = _SHARED_TOKENIZER(text, padding=True, truncation=True,
                                   max_length=512, return_tensors='pt').to(_DEVICE)
        with torch.no_grad():
            outputs = _SHARED_MODEL(**inputs)
        # Attention-mask-weighted mean pooling over token embeddings.
        attention_mask = inputs['attention_mask']
        token_embeddings = outputs.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        # Normalize embeddings so inner product == cosine similarity
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        return embeddings.cpu().numpy()[0].astype('float32')

    def analyze_query_fast(self, query: str) -> Dict[str, Any]:
        """Classify the query and extract legal concepts with minimal work.

        Returns a dict with 'original_query', 'query_type', 'key_concepts'
        and 'expanded_queries' (at most 2 variants).
        """
        query_lower = query.lower()
        # Quick query type classification: first matching category wins.
        query_type = 'general'
        for qtype, patterns in QUERY_PATTERNS.items():
            if any(pattern in query_lower for pattern in patterns):
                query_type = qtype
                break
        # Extract key concepts quickly (verbatim substring match)
        key_concepts = []
        for concept_category, concepts in LEGAL_CONCEPTS.items():
            for concept in concepts:
                if concept in query_lower:
                    key_concepts.append(concept)
        # Simple query expansion
        expanded_queries = [query]
        if key_concepts:
            expanded_queries.append(f"{query} {' '.join(key_concepts[:2])}")
        return {
            'original_query': query,
            'query_type': query_type,
            'key_concepts': key_concepts,
            'expanded_queries': expanded_queries[:2]  # Limit to 2 for speed
        }

    def fast_retrieval(self, query_analysis: Dict[str, Any], top_k: int = 10) -> List[Tuple[Dict[str, Any], float]]:
        """Two-stage retrieval: dense (FAISS) first, BM25 fallback/boost second.

        Returns up to top_k (chunk, score) pairs sorted by descending score.
        """
        candidates = {}
        # Stage 1: Dense retrieval with primary query only
        query = query_analysis['original_query']
        query_emb = self.create_embedding(query)
        scores, indices = self.dense_index.search(
            query_emb.reshape(1, -1),
            min(top_k * 2, len(self.chunks_data))
        )
        for idx, score in zip(indices[0], scores[0]):
            # BUG FIX: FAISS pads missing neighbors with -1. The previous
            # `idx < len(...)` check let -1 slip through and (via negative
            # indexing) wrongly pull in the LAST chunk; guard both bounds.
            if 0 <= idx < len(self.chunks_data):
                chunk = self.chunks_data[idx]
                chunk_id = chunk['id']
                candidates[chunk_id] = {
                    'chunk': chunk,
                    'score': float(score) * chunk['importance_score']
                }
        # Stage 2: BM25 boost for top candidates (only when dense came up short)
        if len(candidates) < top_k:
            query_tokens = query.lower().split()
            bm25_scores = self.bm25_index.get_scores(query_tokens)
            top_bm25_indices = np.argsort(bm25_scores)[-top_k:][::-1]
            for idx in top_bm25_indices:
                if idx < len(self.chunks_data):
                    chunk = self.chunks_data[idx]
                    chunk_id = chunk['id']
                    if chunk_id not in candidates:
                        candidates[chunk_id] = {
                            'chunk': chunk,
                            'score': float(bm25_scores[idx]) * 0.3  # Lower weight for BM25
                        }
                    else:
                        candidates[chunk_id]['score'] += float(bm25_scores[idx]) * 0.2
        # Convert to list and sort by score, descending
        final_scores = [(data['chunk'], data['score']) for data in candidates.values()]
        final_scores.sort(key=lambda x: x[1], reverse=True)
        return final_scores[:top_k]

    @staticmethod
    def _entity_preview(entities: List[Any]) -> List[str]:
        """Render up to 3 stored entities as plain strings.

        Entities default to [] in _process_db_chunks_fast and their element
        shape is not guaranteed by this file: tolerate both {'text': ...}
        dicts and plain strings instead of crashing on e['text'].
        """
        preview = []
        for e in entities[:3]:
            if isinstance(e, dict):
                preview.append(str(e.get('text', '')))
            else:
                preview.append(str(e))
        return preview

    def generate_fast_answer(self, query: str, retrieved_chunks: List[Tuple[Dict[str, Any], float]]) -> Dict[str, Any]:
        """Call the Groq LLM over the top retrieved chunks and package the answer.

        Returns a dict with 'answer', 'confidence' and 'sources', or an
        {'error': ...} dict if the client is missing or the call fails.
        """
        if not self.groq_client:
            return {'error': 'Groq client not initialized'}
        # Prepare context efficiently (top 3 chunks, 600 chars each, for speed)
        context_parts = []
        for i, (chunk, score) in enumerate(retrieved_chunks[:3], 1):
            context_parts.append(f"""
Document {i} - Relevance: {score:.2f}
{chunk['text'][:600]}
""")
        context = "\n---\n".join(context_parts)
        system_prompt = """You are a legal AI assistant. Provide concise, accurate answers based ONLY on the provided documents. If information isn't in the documents, state that clearly."""
        user_prompt = f"""Query: {query}
Documents:
{context}
Provide a clear, concise answer based on the documents."""
        try:
            response = self.groq_client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model="llama-3.1-8b-instant",
                temperature=0.1,
                max_tokens=500  # Limit for speed
            )
            answer = response.choices[0].message.content
            # Confidence heuristic: mean retrieval score of top 3, capped at 100.
            avg_score = sum(score for _, score in retrieved_chunks[:3]) / min(3, len(retrieved_chunks))
            confidence = min(avg_score * 100, 100)
            return {
                'answer': answer,
                'confidence': confidence,
                'sources': [
                    {
                        'chunk_id': chunk['id'],
                        'title': chunk['title'],
                        'section': chunk['section_type'],
                        'relevance_score': float(score),
                        'text_preview': chunk['text'][:200] + '...',
                        'entities': self._entity_preview(chunk['entities'])
                    }
                    for chunk, score in retrieved_chunks[:5]
                ]
            }
        except Exception as e:
            return {'error': f'Error generating answer: {str(e)}'}

    def query_documents(self, query: str, top_k: int = 5) -> Dict[str, Any]:
        """Main entry point: analyze, retrieve and answer a query.

        Returns the answer dict augmented with 'query_analysis' and
        'processing_time', or an {'error': ...} dict.
        """
        if not self.chunks_data:
            return {'error': f'No documents indexed for session {self.session_id}'}
        start_time = time.time()
        # Fast query analysis
        query_analysis = self.analyze_query_fast(query)
        # Fast retrieval
        retrieved_chunks = self.fast_retrieval(query_analysis, top_k)
        if not retrieved_chunks:
            return {
                'error': 'No relevant documents found',
                'query_analysis': query_analysis
            }
        # Generate answer
        result = self.generate_fast_answer(query, retrieved_chunks)
        result['query_analysis'] = query_analysis
        result['processing_time'] = time.time() - start_time
        logger.info(f"Query processed in {result['processing_time']:.2f}s")
        return result
# For backward compatibility - replace SessionRAG with OptimizedSessionRAG.
# (A stray trailing "|" — blob-viewer extraction residue that made this a
# syntax error — has been removed.)
SessionRAG = OptimizedSessionRAG