Kartik Narang commited on
Commit
3cfeab7
·
0 Parent(s):

first clean commit

Browse files
Files changed (5) hide show
  1. app.py +594 -0
  2. requirements.txt +33 -0
  3. simple/ner.py +159 -0
  4. simple/rag.py +593 -0
  5. simple/summarizer.py +187 -0
app.py ADDED
@@ -0,0 +1,594 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ import uuid
4
+ from datetime import datetime, timedelta
5
+ from typing import Dict, Any, List, Optional
6
+ import logging
7
+ from contextlib import asynccontextmanager
8
+
9
+ from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
10
+ from fastapi.middleware.cors import CORSMiddleware
11
+ from fastapi.responses import JSONResponse
12
+ import uvicorn
13
+
14
+ from motor.motor_asyncio import AsyncIOMotorClient
15
+ import pymongo
16
+ from pymongo import ASCENDING
17
+ import PyPDF2
18
+ import docx
19
+ import io
20
+ from PIL import Image
21
+ import pytesseract
22
+
23
+ # Import our models
24
+ from simple.rag import initialize_models, process_documents, create_embedding, chunk_text_hierarchical
25
+ from simple.ner import extract_legal_entities
26
+ from simple.summarizer import summarize_legal_document
27
+
28
+ # Configure logging
29
+ logging.basicConfig(level=logging.INFO)
30
+ logger = logging.getLogger(__name__)
31
+
32
+ # Global variables
33
+ mongodb_client: Optional[AsyncIOMotorClient] = None
34
+ db = None
35
+ cleanup_task = None
36
+
37
+ # Configuration
38
+ MONGODB_URI = os.getenv("MONGODB_URI", "mongodb+srv://username:password@cluster.mongodb.net/")
39
+ DATABASE_NAME = os.getenv("DATABASE_NAME", "legal_rag_system")
40
+ HF_MODEL_ID = os.getenv("HF_MODEL_ID", "sentence-transformers/all-MiniLM-L6-v2")
41
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY", None)
42
+ SESSION_EXPIRE_HOURS = int(os.getenv("SESSION_EXPIRE_HOURS", "24"))
43
+
44
+ # Supported file types
45
+ SUPPORTED_EXTENSIONS = {'.pdf', '.txt', '.docx', '.doc'}
46
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
47
+
48
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: initialize services, serve, then tear down."""
    await startup_event()   # bring up MongoDB, ML models, background cleanup
    yield                   # application serves requests while suspended here
    await shutdown_event()  # release resources on normal shutdown
56
+
57
# FastAPI application instance; `lifespan` wires startup/shutdown handlers.
app = FastAPI(
    title="Legal Document Processor",
    description="Process legal documents with NER, summarization, and embeddings",
    version="1.0.0",
    lifespan=lifespan
)

# CORS middleware — currently wide open to any origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure this properly for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
72
+
73
async def startup_event():
    """Initialize services on startup.

    Connects to MongoDB, creates TTL/lookup indexes, loads the ML models,
    and starts the hourly session-cleanup task. Any failure is logged and
    re-raised so the process fails fast instead of serving half-initialized.
    """
    global mongodb_client, db, cleanup_task

    try:
        logger.info("🚀 Starting up Legal Document Processor...")

        # Initialize MongoDB
        logger.info("📊 Connecting to MongoDB...")
        mongodb_client = AsyncIOMotorClient(MONGODB_URI)
        db = mongodb_client[DATABASE_NAME]

        # Test connection — motor connects lazily, so ping to surface errors now
        await mongodb_client.admin.command('ping')
        logger.info("✅ MongoDB connected successfully")

        # Create indexes
        await create_indexes()

        # Initialize ML models (embedding model + optional Groq client)
        logger.info("🤖 Loading ML models...")
        initialize_models(HF_MODEL_ID, GROQ_API_KEY)
        logger.info("✅ Models loaded successfully")

        # Start cleanup task (runs until cancelled at shutdown)
        cleanup_task = asyncio.create_task(periodic_cleanup())
        logger.info("🧹 Cleanup task started")

        logger.info("🎉 Startup completed successfully!")

    except Exception as e:
        logger.error(f"❌ Startup failed: {str(e)}")
        raise
106
+
107
async def shutdown_event():
    """Stop the background cleanup task and close the MongoDB connection."""
    global mongodb_client, cleanup_task

    logger.info("🛑 Shutting down...")

    task = cleanup_task
    if task:
        task.cancel()
        try:
            await task  # wait for the task to acknowledge cancellation
        except asyncio.CancelledError:
            pass  # expected: we cancelled it on purpose

    client = mongodb_client
    if client:
        client.close()

    logger.info("✅ Shutdown completed")
124
+
125
async def create_indexes():
    """Create MongoDB indexes for optimal performance.

    Every collection gets a TTL index on ``created_at`` (expireAfterSeconds)
    so documents are purged automatically after SESSION_EXPIRE_HOURS.
    Failures are logged but not raised — the app can run without indexes.
    """
    try:
        # Sessions collection indexes
        await db.sessions.create_index([("session_id", ASCENDING)], unique=True)
        await db.sessions.create_index([("created_at", ASCENDING)], expireAfterSeconds=SESSION_EXPIRE_HOURS * 3600)
        await db.sessions.create_index([("status", ASCENDING)])

        # Chunks collection indexes
        await db.chunks.create_index([("session_id", ASCENDING)])
        await db.chunks.create_index([("chunk_id", ASCENDING)])
        await db.chunks.create_index([("created_at", ASCENDING)], expireAfterSeconds=SESSION_EXPIRE_HOURS * 3600)

        # NER results collection indexes
        await db.ner_results.create_index([("session_id", ASCENDING)])
        await db.ner_results.create_index([("created_at", ASCENDING)], expireAfterSeconds=SESSION_EXPIRE_HOURS * 3600)

        # Summaries collection indexes
        await db.summaries.create_index([("session_id", ASCENDING)])
        await db.summaries.create_index([("created_at", ASCENDING)], expireAfterSeconds=SESSION_EXPIRE_HOURS * 3600)

        logger.info("📊 Database indexes created successfully")

    except Exception as e:
        logger.error(f"❌ Failed to create indexes: {str(e)}")
150
+
151
async def periodic_cleanup():
    """Hourly background loop that purges expired sessions until cancelled."""
    while True:
        try:
            await asyncio.sleep(3600)  # one-hour cadence
            await cleanup_expired_sessions()
        except asyncio.CancelledError:
            break  # shutdown requested: leave the loop quietly
        except Exception as exc:
            # Keep the loop alive across transient errors
            logger.error(f"❌ Cleanup task error: {str(exc)}")
161
+
162
async def cleanup_expired_sessions():
    """Clean up expired sessions from MongoDB.

    Deletes every session (and its chunks, NER results, and summaries)
    whose ``created_at`` is older than SESSION_EXPIRE_HOURS. This is a
    belt-and-braces sweep on top of the TTL indexes created at startup.
    Errors are logged, never raised, so the cleanup loop keeps running.
    """
    try:
        cutoff_time = datetime.utcnow() - timedelta(hours=SESSION_EXPIRE_HOURS)

        # Count expired sessions first so we only log when work was done
        expired_count = await db.sessions.count_documents({
            "created_at": {"$lt": cutoff_time}
        })

        if expired_count > 0:
            # Delete expired sessions and related data
            await db.sessions.delete_many({"created_at": {"$lt": cutoff_time}})
            await db.chunks.delete_many({"created_at": {"$lt": cutoff_time}})
            await db.ner_results.delete_many({"created_at": {"$lt": cutoff_time}})
            await db.summaries.delete_many({"created_at": {"$lt": cutoff_time}})

            logger.info(f"🧹 Cleaned up {expired_count} expired sessions")

    except Exception as e:
        logger.error(f"❌ Cleanup failed: {str(e)}")
183
+
184
def extract_text_from_file(file_content: bytes, filename: str) -> str:
    """Dispatch text extraction based on the file's extension.

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Original file name; only its extension is used.

    Returns:
        Extracted plain text.

    Raises:
        ValueError: for an unsupported extension.
        Exception: whatever the format-specific extractor raises.
    """
    file_ext = os.path.splitext(filename.lower())[1]

    try:
        if file_ext == '.pdf':
            return extract_text_from_pdf(file_content)
        elif file_ext == '.txt':
            return file_content.decode('utf-8', errors='ignore')
        elif file_ext in ['.docx', '.doc']:
            return extract_text_from_docx(file_content)
        else:
            raise ValueError(f"Unsupported file type: {file_ext}")
    except Exception as e:
        # Bug fix: the message previously printed the literal "(unknown)";
        # include the actual filename so failures are traceable in logs.
        logger.error(f"❌ Text extraction failed for {filename}: {str(e)}")
        raise
200
+
201
def extract_text_from_pdf(file_content: bytes) -> str:
    """Extract text from a PDF byte stream with PyPDF2 (OCR path is a stub)."""
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # One newline-terminated string per page, concatenated
        text = "".join(page.extract_text() + "\n" for page in reader.pages)

        if not text.strip():
            # Likely a scanned PDF with no text layer; OCR is not wired up yet
            logger.info("📷 No text found in PDF, attempting OCR...")
            text = "OCR extraction not implemented yet"

        return text
    except Exception as e:
        logger.error(f"❌ PDF extraction failed: {str(e)}")
        raise
221
+
222
def extract_text_from_docx(file_content: bytes) -> str:
    """Extract text from a DOCX byte stream, one line per paragraph."""
    try:
        document = docx.Document(io.BytesIO(file_content))
        return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
    except Exception as e:
        logger.error(f"❌ DOCX extraction failed: {str(e)}")
        raise
236
+
237
async def process_document_pipeline(
    session_id: str,
    text: str,
    filename: str,
    background_tasks: BackgroundTasks
):
    """Run the full processing pipeline for one uploaded document.

    Steps, in order: mark the session "processing", run NER, summarize,
    chunk + embed, then mark the session "completed". On any failure the
    session is marked "failed" with the error message; nothing is raised
    (this runs as a FastAPI background task).

    NOTE(review): ``background_tasks`` is accepted but never used in this
    body — confirm with the caller before removing it from the signature.
    """
    try:
        logger.info(f"🔄 Starting processing pipeline for session {session_id}")

        # Update session status so /status reflects progress immediately
        await db.sessions.update_one(
            {"session_id": session_id},
            {"$set": {"status": "processing", "updated_at": datetime.utcnow()}}
        )

        # Step 1: NER Processing
        logger.info(f"🔍 Running NER for session {session_id}")
        ner_results = extract_legal_entities(text)

        # Store NER results
        await db.ner_results.insert_one({
            "session_id": session_id,
            "filename": filename,
            "results": ner_results,
            "created_at": datetime.utcnow()
        })

        # Step 2: Summarization
        logger.info(f"📄 Running summarization for session {session_id}")
        summary_results = summarize_legal_document(
            text,
            max_sentences=5,
            groq_api_key=GROQ_API_KEY
        )

        # Store summary results
        await db.summaries.insert_one({
            "session_id": session_id,
            "filename": filename,
            "results": summary_results,
            "created_at": datetime.utcnow()
        })

        # Step 3: Chunking and Embedding
        logger.info(f"🧩 Creating chunks and embeddings for session {session_id}")
        chunks = chunk_text_hierarchical(text, filename)

        # Create embeddings and store chunks
        chunks_to_store = []
        for chunk in chunks:
            # Create embedding (one model call per chunk)
            embedding = create_embedding(chunk['text'])

            chunk_doc = {
                "session_id": session_id,
                "chunk_id": chunk['id'],
                "text": chunk['text'],
                "title": chunk['title'],
                "section_type": chunk['section_type'],
                "importance_score": chunk['importance_score'],
                "entities": chunk['entities'],
                "embedding": embedding.tolist(),  # Convert numpy array to list
                "created_at": datetime.utcnow()
            }
            chunks_to_store.append(chunk_doc)

        # Batch insert chunks (insert_many rejects an empty list, hence the guard)
        if chunks_to_store:
            await db.chunks.insert_many(chunks_to_store)

        # Update session as completed
        await db.sessions.update_one(
            {"session_id": session_id},
            {
                "$set": {
                    "status": "completed",
                    "updated_at": datetime.utcnow(),
                    "chunk_count": len(chunks_to_store),
                    "processing_completed_at": datetime.utcnow()
                }
            }
        )

        logger.info(f"✅ Processing completed for session {session_id}")

    except Exception as e:
        logger.error(f"❌ Processing failed for session {session_id}: {str(e)}")

        # Update session with error so clients polling /status see the failure
        await db.sessions.update_one(
            {"session_id": session_id},
            {
                "$set": {
                    "status": "failed",
                    "error": str(e),
                    "updated_at": datetime.utcnow()
                }
            }
        )
337
+
338
@app.post("/upload")
async def upload_document(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...)
):
    """Upload a legal document and kick off background processing.

    Validates extension and size, extracts text synchronously, records a
    session document, then schedules ``process_document_pipeline`` as a
    background task. Returns the new ``session_id`` for status polling.

    Raises:
        HTTPException 400: missing file, unsupported type, oversize file,
            or no extractable text.
        HTTPException 500: any other failure.
    """
    try:
        # Validate file
        if not file.filename:
            raise HTTPException(status_code=400, detail="No file provided")

        file_ext = os.path.splitext(file.filename.lower())[1]
        if file_ext not in SUPPORTED_EXTENSIONS:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type. Supported: {', '.join(SUPPORTED_EXTENSIONS)}"
            )

        # Check file size — the whole upload is read into memory here
        file_content = await file.read()
        if len(file_content) > MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
            )

        # Generate session ID
        session_id = str(uuid.uuid4())

        # Extract text (synchronous; happens before the request returns)
        logger.info(f"📄 Extracting text from {file.filename}")
        text = extract_text_from_file(file_content, file.filename)

        if not text.strip():
            raise HTTPException(status_code=400, detail="No text could be extracted from the file")

        # Create session record
        session_doc = {
            "session_id": session_id,
            "filename": file.filename,
            "file_size": len(file_content),
            "text_length": len(text),
            "word_count": len(text.split()),
            "status": "uploaded",
            "created_at": datetime.utcnow(),
            "updated_at": datetime.utcnow()
        }

        await db.sessions.insert_one(session_doc)

        # Start background processing (runs after the response is sent)
        background_tasks.add_task(
            process_document_pipeline,
            session_id,
            text,
            file.filename,
            background_tasks
        )

        logger.info(f"✅ Document uploaded successfully. Session ID: {session_id}")

        return JSONResponse(
            status_code=200,
            content={
                "success": True,
                "session_id": session_id,
                "filename": file.filename,
                "file_size": len(file_content),
                "text_length": len(text),
                "word_count": len(text.split()),
                "status": "processing",
                "message": "Document uploaded successfully. Processing started."
            }
        )

    except HTTPException:
        # Preserve intentional 4xx responses raised above
        raise
    except Exception as e:
        logger.error(f"❌ Upload failed: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}")
418
+
419
@app.get("/status/{session_id}")
async def get_session_status(session_id: str):
    """Return the processing status of a session.

    For completed sessions, enriches the response with entity/summary/chunk
    counts pulled from the result collections.

    Raises:
        HTTPException 404: unknown session_id.
        HTTPException 500: any other failure.
    """
    try:
        session = await db.sessions.find_one({"session_id": session_id})

        if not session:
            raise HTTPException(status_code=404, detail="Session not found")

        # Convert ObjectId to string for JSON serialization
        session["_id"] = str(session["_id"])

        # Bug fix: the session document carries datetime values
        # (created_at, updated_at, ...) which JSONResponse's json.dumps
        # cannot serialize — render them as ISO-8601 strings.
        for key, value in list(session.items()):
            if isinstance(value, datetime):
                session[key] = value.isoformat()

        # Add processing progress info
        if session["status"] == "completed":
            # Get additional info
            ner_result = await db.ner_results.find_one({"session_id": session_id})
            summary_result = await db.summaries.find_one({"session_id": session_id})
            chunk_count = await db.chunks.count_documents({"session_id": session_id})

            session["ner_entities"] = ner_result["results"]["total_entities"] if ner_result else 0
            session["summary_available"] = bool(summary_result)
            session["chunk_count"] = chunk_count

        return JSONResponse(
            status_code=200,
            content={
                "success": True,
                "session": session
            }
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Status check failed: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Status check failed: {str(e)}")
455
+
456
@app.get("/results/{session_id}")
async def get_processing_results(session_id: str):
    """Return NER, summary, and chunk-metadata results for a completed session.

    Responds 202 while processing is still in progress; 404 for an unknown
    session. Chunk text and embeddings are excluded to keep the payload small.
    """
    try:
        # Check if session exists and is completed
        session = await db.sessions.find_one({"session_id": session_id})
        if not session:
            raise HTTPException(status_code=404, detail="Session not found")

        if session["status"] != "completed":
            return JSONResponse(
                status_code=202,
                content={
                    "success": False,
                    "message": f"Processing not completed. Current status: {session['status']}"
                }
            )

        # Get NER results
        ner_result = await db.ner_results.find_one({"session_id": session_id})

        # Get summary results
        summary_result = await db.summaries.find_one({"session_id": session_id})

        # Get chunk metadata (not full text)
        chunks_cursor = db.chunks.find(
            {"session_id": session_id},
            {"text": 0, "embedding": 0}  # Exclude large fields
        )
        chunks_metadata = await chunks_cursor.to_list(length=None)

        # Bug fix: ObjectId and datetime values are not JSON serializable
        # by JSONResponse's json.dumps — stringify both per chunk.
        for chunk in chunks_metadata:
            chunk["_id"] = str(chunk["_id"])
            for key, value in list(chunk.items()):
                if isinstance(value, datetime):
                    chunk[key] = value.isoformat()

        # Same fix for the session's completion timestamp
        completed_at = session.get("processing_completed_at")
        if isinstance(completed_at, datetime):
            completed_at = completed_at.isoformat()

        return JSONResponse(
            status_code=200,
            content={
                "success": True,
                "session_id": session_id,
                "filename": session["filename"],
                "ner_results": ner_result["results"] if ner_result else None,
                "summary_results": summary_result["results"] if summary_result else None,
                "chunks_metadata": {
                    "total_chunks": len(chunks_metadata),
                    "chunks": chunks_metadata[:10]  # Return first 10 chunks metadata
                },
                "processing_completed_at": completed_at
            }
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Results retrieval failed: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Results retrieval failed: {str(e)}")
512
+
513
@app.get("/health")
async def health_check():
    """Report service health; 503 with the error when MongoDB is unreachable."""
    try:
        # Probe the DB connection — the only dependency actually checked here
        await mongodb_client.admin.command('ping')
    except Exception as e:
        logger.error(f"❌ Health check failed: {str(e)}")
        return JSONResponse(
            status_code=503,
            content={
                "status": "unhealthy",
                "error": str(e),
                "timestamp": datetime.utcnow().isoformat()
            }
        )

    return JSONResponse(
        status_code=200,
        content={
            "status": "healthy",
            "timestamp": datetime.utcnow().isoformat(),
            "services": {
                "mongodb": "connected",
                "ml_models": "loaded"
            }
        }
    )
541
+
542
@app.delete("/session/{session_id}")
async def delete_session(session_id: str):
    """Manually delete a session and all related data.

    Improvement: the original purged chunks/NER/summaries *before* checking
    whether the session existed, then raised 404 anyway. Delete the session
    first and fail fast on 404 so the related deletes only run when there
    actually was a session.
    """
    try:
        session_result = await db.sessions.delete_one({"session_id": session_id})
        if session_result.deleted_count == 0:
            raise HTTPException(status_code=404, detail="Session not found")

        # Remove all per-session derived data
        await db.chunks.delete_many({"session_id": session_id})
        await db.ner_results.delete_many({"session_id": session_id})
        await db.summaries.delete_many({"session_id": session_id})

        return JSONResponse(
            status_code=200,
            content={
                "success": True,
                "message": f"Session {session_id} deleted successfully"
            }
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Session deletion failed: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Session deletion failed: {str(e)}")
568
+
569
@app.get("/")
async def root():
    """Describe the API: service name, version, endpoints, accepted formats."""
    endpoints = {
        "upload": "POST /upload - Upload a legal document for processing",
        "status": "GET /status/{session_id} - Check processing status",
        "results": "GET /results/{session_id} - Get processing results",
        "health": "GET /health - Health check",
        "delete": "DELETE /session/{session_id} - Delete a session"
    }
    return {
        "service": "Legal Document Processor",
        "version": "1.0.0",
        "status": "running",
        "endpoints": endpoints,
        "supported_formats": list(SUPPORTED_EXTENSIONS)
    }
585
+
586
if __name__ == "__main__":
    # Default 7860 is the port Hugging Face Spaces exposes
    port = int(os.getenv("PORT", 7860))
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=port,
        reload=False,  # no auto-reload in production
        access_log=True
    )
requirements.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face Spaces requirements
2
+ gradio==4.44.0
3
+ requests==2.31.0
4
+ fastapi==0.115.6
5
+ uvicorn==0.32.1
6
+ python-multipart==0.0.9 # ✅ needed for FastAPI file uploads
7
+
8
+ # Core ML/NLP
9
+ torch==2.2.2
10
+ transformers==4.44.2
11
+ sentence-transformers==2.2.2
12
+ spacy==3.8.2
13
+ scikit-learn==1.5.2
14
+ numpy==1.26.4
15
+ pandas==2.2.3
16
+ nltk==3.9.1
17
+
18
+ # Retrieval / Search
19
+ faiss-cpu==1.7.4
20
+ rank-bm25==0.2.2
21
+
22
+ # File parsing (PDF, DOCX, OCR)
23
+ PyPDF2==3.0.1
24
+ pdfplumber==0.11.4
25
+ python-docx==1.1.2
26
+ pytesseract==0.3.13
27
+ easyocr==1.7.1
28
+ pdf2image==1.16.3
29
+ opencv-python==4.10.0.84
30
+ Pillow==10.4.0
31
+
32
+ # API clients
33
+ groq==0.13.0
simple/ner.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ from huggingface_hub import snapshot_download
3
+ from typing import Dict, Any
4
+
5
def extract_legal_entities(text, model_id=None, hf_token=None) -> Dict[str, Any]:
    """
    Extract named entities from legal text.

    Args:
        text: Input text to process
        model_id: Optional Hugging Face model ID (defaults to en_core_web_sm)
        hf_token: Optional Hugging Face token

    Returns:
        Dictionary with entities and counts. On any failure the dict has an
        ``error`` key and empty entity fields instead of raising.
    """
    if not text or not text.strip():
        return {
            "error": "Empty text provided",
            "entities": [],
            "entity_counts": {},
            "total_entities": 0
        }

    # Load model (falls back to en_core_web_sm, may return None)
    nlp = _load_ner_model(model_id, hf_token)
    if not nlp:
        return {
            "error": "Failed to load NER model",
            "entities": [],
            "entity_counts": {},
            "total_entities": 0
        }

    try:
        # Process text (handle large texts by chunking past spaCy's comfort zone)
        if len(text) > 4000000:
            return _process_large_text(text, nlp)

        doc = nlp(text)

        entities = []
        entity_counts = {}

        for ent in doc.ents:
            # _process_entity may split "X and Y" spans into multiple tuples
            processed_entities = _process_entity(ent)

            for entity_text, entity_label in processed_entities:
                # NOTE(review): split sub-entities inherit the span's original
                # start/end offsets — confirm this is intended downstream.
                entity_info = {
                    "text": entity_text,
                    "label": entity_label,
                    "start": ent.start_char,
                    "end": ent.end_char
                }
                entities.append(entity_info)

                if entity_label not in entity_counts:
                    entity_counts[entity_label] = []
                entity_counts[entity_label].append(entity_text)

        # Process counts: deduplicate per label and record unique count
        for label in entity_counts:
            unique_entities = list(set(entity_counts[label]))
            entity_counts[label] = {
                "entities": unique_entities,
                "count": len(unique_entities)
            }

        return {
            "entities": entities,
            "entity_counts": entity_counts,
            "total_entities": len(entities),
            "unique_labels": list(entity_counts.keys())
        }

    except Exception as e:
        return {
            "error": str(e),
            "entities": [],
            "entity_counts": {},
            "total_entities": 0
        }
83
+
84
def _load_ner_model(model_id, hf_token):
    """Load a spaCy pipeline; fall back to en_core_web_sm, or None on failure."""
    name = model_id or 'en_core_web_sm'

    try:
        if name == 'en_core_web_sm':
            # Load standard model directly
            return spacy.load("en_core_web_sm")
        # Custom model: fetch the packaged pipeline from the Hugging Face Hub
        repo_path = snapshot_download(
            repo_id=name,
            token=hf_token if hf_token else None
        )
        return spacy.load(repo_path)
    except Exception:
        pass  # fall through to the standard-model fallback below

    # Last resort: the standard small English pipeline
    try:
        return spacy.load("en_core_web_sm")
    except Exception:
        return None
107
+
108
def _process_large_text(text, nlp, chunk_size=3000000):
    """Process very large text by running NER on fixed-size slices.

    Character offsets are rebased to the full text by adding each slice's
    start position (i * chunk_size). A slice that fails NER is skipped
    rather than aborting the whole document.
    """
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    all_entities = []
    all_entity_counts = {}

    for i, chunk in enumerate(chunks):
        try:
            doc = nlp(chunk)

            for ent in doc.ents:
                # May split conjoined "X and Y" spans into several tuples
                processed_entities = _process_entity(ent)

                for entity_text, entity_label in processed_entities:
                    entity_info = {
                        "text": entity_text,
                        "label": entity_label,
                        # Rebase slice-local offsets onto the full text
                        "start": ent.start_char + (i * chunk_size),
                        "end": ent.end_char + (i * chunk_size)
                    }
                    all_entities.append(entity_info)

                    if entity_label not in all_entity_counts:
                        all_entity_counts[entity_label] = []
                    all_entity_counts[entity_label].append(entity_text)

        except Exception:
            # Best-effort: skip a failing slice, keep the rest
            continue

    # Process counts: deduplicate per label and record unique count
    for label in all_entity_counts:
        unique_entities = list(set(all_entity_counts[label]))
        all_entity_counts[label] = {
            "entities": unique_entities,
            "count": len(unique_entities)
        }

    return {
        "entities": all_entities,
        "entity_counts": all_entity_counts,
        "total_entities": len(all_entities),
        "unique_labels": list(all_entity_counts.keys()),
        "processed_in_chunks": True,
        "num_chunks": len(chunks)
    }
153
+
154
+ def _process_entity(ent):
155
+ """Process individual entity (handle special cases like 'X and Y')"""
156
+ if ent.label_ in ["PRECEDENT", "ORG"] and " and " in ent.text:
157
+ parts = ent.text.split(" and ")
158
+ return [(p.strip(), "ORG") for p in parts]
159
+ return [(ent.text, ent.label_)]
simple/rag.py ADDED
@@ -0,0 +1,593 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from transformers import AutoTokenizer, AutoModel
4
+ from typing import List, Dict, Any, Tuple, Optional
5
+ import faiss
6
+ import hashlib
7
+ from tqdm import tqdm
8
+ from groq import Groq
9
+ import re
10
+ import nltk
11
+ from sklearn.metrics.pairwise import cosine_similarity
12
+ import networkx as nx
13
+ from collections import defaultdict
14
+ import spacy
15
+ from rank_bm25 import BM25Okapi
16
+
17
# Global variables for models — populated by initialize_models()
MODEL = None        # transformer embedding model
TOKENIZER = None    # matching tokenizer
GROQ_CLIENT = None  # optional Groq LLM client (None when no API key)
NLP_MODEL = None    # spaCy pipeline, or None if unavailable
DEVICE = None       # torch device (cuda when available, else cpu)

# Global indices — NOTE(review): declared here but not populated in this
# module's visible code; confirm where they are built.
DENSE_INDEX = None
BM25_INDEX = None
CONCEPT_GRAPH = None
TOKEN_TO_CHUNKS = None
CHUNKS_DATA = []

# Legal knowledge base: concept category -> related terms used for
# query-concept matching and expansion
LEGAL_CONCEPTS = {
    'liability': ['negligence', 'strict liability', 'vicarious liability', 'product liability'],
    'contract': ['breach', 'consideration', 'offer', 'acceptance', 'damages', 'specific performance'],
    'criminal': ['mens rea', 'actus reus', 'intent', 'malice', 'premeditation'],
    'procedure': ['jurisdiction', 'standing', 'statute of limitations', 'res judicata'],
    'evidence': ['hearsay', 'relevance', 'privilege', 'burden of proof', 'admissibility'],
    'constitutional': ['due process', 'equal protection', 'free speech', 'search and seizure']
}

# Query-type classifier: type -> trigger substrings matched against the
# lowercased query (first family with a hit wins in analyze_query)
QUERY_PATTERNS = {
    'precedent': ['case', 'precedent', 'ruling', 'held', 'decision'],
    'statute_interpretation': ['statute', 'section', 'interpretation', 'meaning', 'definition'],
    'factual': ['what happened', 'facts', 'circumstances', 'events'],
    'procedure': ['how to', 'procedure', 'process', 'filing', 'requirements']
}
47
+
48
def initialize_models(model_id: str, groq_api_key: str = None):
    """Load the embedding model, optional Groq client, and spaCy pipeline.

    Args:
        model_id: Hugging Face model ID for the embedding model.
        groq_api_key: Optional key; when present a Groq client is created.

    Side effects: assigns the module globals MODEL, TOKENIZER, GROQ_CLIENT,
    NLP_MODEL, and DEVICE.
    """
    global MODEL, TOKENIZER, GROQ_CLIENT, NLP_MODEL, DEVICE

    # Best-effort NLTK data download (host may be offline or data cached).
    # Fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
    except Exception:
        pass

    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {DEVICE}")

    print(f"Loading model: {model_id}")
    TOKENIZER = AutoTokenizer.from_pretrained(model_id)
    MODEL = AutoModel.from_pretrained(model_id).to(DEVICE)
    MODEL.eval()  # inference mode

    if groq_api_key:
        GROQ_CLIENT = Groq(api_key=groq_api_key)

    # spaCy is optional; entity extraction degrades to regex-only without it
    try:
        NLP_MODEL = spacy.load("en_core_web_sm")
    except Exception:
        print("SpaCy model not found, using basic NER")
        NLP_MODEL = None
74
+
75
def create_embedding(text: str) -> np.ndarray:
    """Embed *text* as an L2-normalized, attention-mask mean-pooled vector.

    Returns a 1-D numpy array (the single sequence's embedding).
    """
    encoded = TOKENIZER(text, padding=True, truncation=True,
                        max_length=512, return_tensors='pt').to(DEVICE)

    with torch.no_grad():
        model_out = MODEL(**encoded)
        mask = encoded['attention_mask']
        hidden = model_out.last_hidden_state
        # Mean-pool over real (non-padding) tokens only
        mask_f = mask.unsqueeze(-1).expand(hidden.size()).float()
        pooled = torch.sum(hidden * mask_f, 1) / torch.clamp(mask_f.sum(1), min=1e-9)

    # Unit-length vectors make dot products equal cosine similarity
    pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)

    return pooled.cpu().numpy()[0]
91
+
92
def extract_legal_entities(text: str) -> List[Dict[str, Any]]:
    """Collect spaCy named entities plus regex-matched citations/statute refs.

    Each result dict has 'text', 'type', and a relative 'importance' weight.
    """
    found = []

    # Named entities via spaCy, when a pipeline was loaded
    if NLP_MODEL:
        doc = NLP_MODEL(text[:5000])  # Limit for performance
        for ent in doc.ents:
            if ent.label_ in ['PERSON', 'ORG', 'LAW', 'GPE']:
                found.append({
                    'text': ent.text,
                    'type': ent.label_,
                    'importance': 1.0
                })

    # Legal citations (number–reporter–number shape)
    for match in re.finditer(r'\b\d+\s+[A-Z][a-z]+\.?\s+\d+\b', text):
        found.append({
            'text': match.group(),
            'type': 'case_citation',
            'importance': 2.0
        })

    # Statute references ("§ N" or "Section N")
    for match in re.finditer(r'§\s*\d+[\.\d]*|\bSection\s+\d+', text):
        found.append({
            'text': match.group(),
            'type': 'statute',
            'importance': 1.5
        })

    return found
125
+
126
def analyze_query(query: str) -> Dict[str, Any]:
    """Classify a query, pull entities/concepts, and build expanded variants."""
    lowered = query.lower()

    # Query-type classification: first pattern family with a hit wins
    query_type = next(
        (qtype for qtype, patterns in QUERY_PATTERNS.items()
         if any(pattern in lowered for pattern in patterns)),
        'general'
    )

    # Entities mentioned in the query itself
    entities = extract_legal_entities(query)

    # Concepts from the knowledge base mentioned verbatim in the query
    key_concepts = [
        concept
        for concepts in LEGAL_CONCEPTS.values()
        for concept in concepts
        if concept in lowered
    ]

    expanded_queries = [query]

    # Concept expansion: append up to three matched concepts
    if key_concepts:
        expanded_queries.append(f"{query} {' '.join(key_concepts[:3])}")

    # Type-based expansion
    if query_type == 'precedent':
        expanded_queries.append(f"legal precedent case law {query}")
    elif query_type == 'statute_interpretation':
        expanded_queries.append(f"statutory interpretation meaning {query}")

    # HyDE: add a hypothetical answer document when an LLM is available
    if GROQ_CLIENT:
        hyde_doc = generate_hypothetical_document(query)
        if hyde_doc:
            expanded_queries.append(hyde_doc)

    return {
        'original_query': query,
        'query_type': query_type,
        'entities': entities,
        'key_concepts': key_concepts,
        'expanded_queries': expanded_queries[:4]  # Limit to 4 queries
    }
173
+
174
def generate_hypothetical_document(query: str) -> Optional[str]:
    """Generate a short hypothetical answer document (HyDE technique).

    Asks the Groq LLM to write a brief, legal-sounding excerpt that would
    answer *query*; the text is embedded alongside the real query to
    improve dense retrieval recall.

    Returns:
        The generated excerpt, or None when no client is configured or
        the API call fails (HyDE is strictly best-effort).
    """
    if not GROQ_CLIENT:
        return None

    try:
        prompt = f"""Generate a brief hypothetical legal document excerpt that would answer this question: {query}

Write it as if it's from an actual legal case or statute. Be specific and use legal language.
Keep it under 100 words."""

        response = GROQ_CLIENT.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a legal expert generating hypothetical legal text."},
                {"role": "user", "content": prompt}
            ],
            model="llama-3.1-8b-instant",
            temperature=0.3,
            max_tokens=150
        )
        return response.choices[0].message.content
    except Exception:
        # Bug fix: was a bare `except:`, which also swallowed SystemExit
        # and KeyboardInterrupt.  API failures are still tolerated.
        return None
198
+
199
def chunk_text_hierarchical(text: str, title: str = "") -> List[Dict[str, Any]]:
    """Split legal text into chunks annotated with section type and importance.

    Sentences are grouped into ~500-word chunks (max 10 sentences each).
    Section headings (FACTS, HOLDING, REASONING, DISSENT, CONCLUSION) are
    located first so each chunk can be tagged with the section it falls
    in; importance scores weight holdings/conclusions above facts and
    dissents and are boosted by the entities found inside the chunk.

    Args:
        text: Raw document text.
        title: Document title, embedded in each chunk and its id.

    Returns:
        List of chunk dicts: id, text, title, section_type,
        importance_score, entities, and an embedding placeholder
        (filled in later by build_all_indices).
    """
    chunks: List[Dict[str, Any]] = []

    # Collapse whitespace runs so sentence offsets are stable.
    text = re.sub(r'\s+', ' ', text)

    # Locate legal section headings and record their start offsets.
    section_patterns = [
        (r'(?i)\bFACTS?\b[:\s]', 'facts'),
        (r'(?i)\bHOLDING\b[:\s]', 'holding'),
        (r'(?i)\bREASONING\b[:\s]', 'reasoning'),
        (r'(?i)\bDISSENT\b[:\s]', 'dissent'),
        (r'(?i)\bCONCLUSION\b[:\s]', 'conclusion')
    ]

    sections = []
    for pattern, section_type in section_patterns:
        for match in re.finditer(pattern, text):
            sections.append((match.start(), section_type))
    sections.sort(key=lambda x: x[0])

    # Sentence segmentation (NLTK when available, naive split otherwise).
    import nltk
    try:
        sentences = nltk.sent_tokenize(text)
    except Exception:  # narrowed from a bare `except:`
        sentences = text.split('. ')

    current_section = 'introduction'
    section_sentences: List[str] = []
    chunk_size = 500  # target chunk length, in words

    # Bug fix: track a running search offset instead of `text.find(sent)`,
    # which always returned the FIRST occurrence and so mis-attributed the
    # section of any repeated sentence (and was O(n*m) besides).
    search_pos = 0
    for sent in sentences:
        sent_pos = text.find(sent, search_pos)
        if sent_pos == -1:
            # Tokenizer may normalize text; fall back to the current offset.
            sent_pos = search_pos
        else:
            search_pos = sent_pos + len(sent)

        # The last heading at or before this sentence wins.
        for pos, stype in sections:
            if sent_pos >= pos:
                current_section = stype

        section_sentences.append(sent)

        # Emit a chunk once it is long enough (by words or sentence count).
        chunk_text = ' '.join(section_sentences)
        if len(chunk_text.split()) >= chunk_size or len(section_sentences) >= 10:
            chunk_id = hashlib.md5(f"{title}_{len(chunks)}_{chunk_text[:50]}".encode()).hexdigest()[:12]

            # Weight the chunk by its section type...
            importance = 1.0
            section_weights = {
                'holding': 2.0, 'conclusion': 1.8, 'reasoning': 1.5,
                'facts': 1.2, 'dissent': 0.8
            }
            importance *= section_weights.get(current_section, 1.0)

            # ...and by the average importance of entities it mentions.
            entities = extract_legal_entities(chunk_text)
            if entities:
                entity_score = sum(e['importance'] for e in entities) / len(entities)
                importance *= (1 + entity_score * 0.5)

            chunks.append({
                'id': chunk_id,
                'text': chunk_text,
                'title': title,
                'section_type': current_section,
                'importance_score': importance,
                'entities': entities,
                'embedding': None  # filled in during indexing
            })

            section_sentences = []

    # Flush any trailing sentences into a final chunk.
    if section_sentences:
        chunk_text = ' '.join(section_sentences)
        chunk_id = hashlib.md5(f"{title}_{len(chunks)}_{chunk_text[:50]}".encode()).hexdigest()[:12]
        chunks.append({
            'id': chunk_id,
            'text': chunk_text,
            'title': title,
            'section_type': current_section,
            'importance_score': 1.0,
            'entities': extract_legal_entities(chunk_text),
            'embedding': None
        })

    return chunks
290
+
291
def build_all_indices(chunks: List[Dict[str, Any]]):
    """Populate every retrieval index from the given chunks.

    Mutates module globals: DENSE_INDEX (FAISS inner-product index over
    normalized embeddings), BM25_INDEX (sparse lexical index),
    TOKEN_TO_CHUNKS (token -> set of chunk indices), CONCEPT_GRAPH
    (chunks linked by shared entity mentions), and CHUNKS_DATA (the
    chunk list itself).  Each chunk's 'embedding' field is filled in
    as a side effect.
    """
    global DENSE_INDEX, BM25_INDEX, CONCEPT_GRAPH, TOKEN_TO_CHUNKS, CHUNKS_DATA

    CHUNKS_DATA = chunks
    print(f"Building indices for {len(chunks)} chunks...")

    # 1. Dense embeddings stored in a FAISS inner-product index
    #    (inner product == cosine similarity for normalized vectors).
    print("Building FAISS index...")
    vectors = []
    for chunk in tqdm(chunks, desc="Creating embeddings"):
        vec = create_embedding(chunk['text'])
        chunk['embedding'] = vec
        vectors.append(vec)

    matrix = np.vstack(vectors)
    DENSE_INDEX = faiss.IndexFlatIP(matrix.shape[1])
    DENSE_INDEX.add(matrix.astype('float32'))

    # 2. BM25 over whitespace-tokenized, lower-cased chunk text.
    print("Building BM25 index...")
    BM25_INDEX = BM25Okapi([c['text'].lower().split() for c in chunks])

    # 3. Inverted token index (ColBERT-style token-level matching).
    print("Building ColBERT token index...")
    TOKEN_TO_CHUNKS = defaultdict(set)
    for idx, chunk in enumerate(chunks):
        for token in chunk['text'].lower().split():
            TOKEN_TO_CHUNKS[token].add(idx)

    # 4. Graph of chunks connected by shared entity mentions.
    print("Building legal concept graph...")
    CONCEPT_GRAPH = nx.Graph()
    for idx, chunk in enumerate(chunks):
        CONCEPT_GRAPH.add_node(idx, text=chunk['text'][:200], importance=chunk['importance_score'])

        own_entities = set(e['text'] for e in chunk['entities'])
        for other_idx, other in enumerate(chunks[idx + 1:], idx + 1):
            shared = own_entities & set(e['text'] for e in other['entities'])
            if shared:
                CONCEPT_GRAPH.add_edge(idx, other_idx, weight=len(shared))

    print("All indices built successfully!")
339
+
340
def multi_stage_retrieval(query_analysis: Dict[str, Any], top_k: int = 10) -> List[Tuple[Dict[str, Any], float]]:
    """Retrieve and rank chunks by fusing four retrieval signals.

    Stages: (1) dense FAISS search over expanded queries, (2) BM25 sparse
    retrieval, (3) entity overlap with the query, (4) concept-graph
    neighbors of the best seeds.  Per-method scores are normalized,
    combined with fixed weights, then boosted by chunk importance and by
    matching the query type to the chunk's section.

    Args:
        query_analysis: Output of analyze_query().
        top_k: Number of results to return.

    Returns:
        Top-k (chunk, final_score) pairs, best first.
    """
    candidates = {}

    print("Performing multi-stage retrieval...")

    # Stage 1: Dense retrieval with expanded queries
    print("Stage 1: Dense retrieval...")
    for query in query_analysis['expanded_queries'][:3]:
        query_emb = create_embedding(query)
        scores, indices = DENSE_INDEX.search(
            query_emb.reshape(1, -1).astype('float32'),
            top_k * 2
        )

        for idx, score in zip(indices[0], scores[0]):
            if idx < len(CHUNKS_DATA):
                chunk_id = CHUNKS_DATA[idx]['id']
                if chunk_id not in candidates:
                    candidates[chunk_id] = {'chunk': CHUNKS_DATA[idx], 'scores': {}}
                candidates[chunk_id]['scores']['dense'] = float(score)

    # Stage 2: Sparse retrieval (BM25)
    print("Stage 2: Sparse retrieval...")
    query_tokens = query_analysis['original_query'].lower().split()
    bm25_scores = BM25_INDEX.get_scores(query_tokens)
    top_bm25_indices = np.argsort(bm25_scores)[-top_k*2:][::-1]

    for idx in top_bm25_indices:
        if idx < len(CHUNKS_DATA):
            chunk_id = CHUNKS_DATA[idx]['id']
            if chunk_id not in candidates:
                candidates[chunk_id] = {'chunk': CHUNKS_DATA[idx], 'scores': {}}
            candidates[chunk_id]['scores']['bm25'] = float(bm25_scores[idx])

    # Stage 3: Entity-based retrieval (accumulates entity importance).
    print("Stage 3: Entity-based retrieval...")
    for entity in query_analysis['entities']:
        for chunk in CHUNKS_DATA:
            chunk_entity_texts = [e['text'].lower() for e in chunk['entities']]
            if entity['text'].lower() in chunk_entity_texts:
                chunk_id = chunk['id']
                if chunk_id not in candidates:
                    candidates[chunk_id] = {'chunk': chunk, 'scores': {}}
                candidates[chunk_id]['scores']['entity'] = \
                    candidates[chunk_id]['scores'].get('entity', 0) + entity['importance']

    # Stage 4: Graph-based retrieval from the first few candidate seeds.
    print("Stage 4: Graph-based retrieval...")
    if candidates and CONCEPT_GRAPH:
        # Perf fix: build an id -> index map once instead of scanning
        # CHUNKS_DATA for every seed candidate (was O(seeds * n)).
        id_to_idx = {chunk['id']: i for i, chunk in enumerate(CHUNKS_DATA)}
        seed_chunks = [id_to_idx[cid] for cid in list(candidates)[:5] if cid in id_to_idx]

        for seed_idx in seed_chunks:
            if seed_idx in CONCEPT_GRAPH:
                for neighbor_idx in list(CONCEPT_GRAPH.neighbors(seed_idx))[:3]:
                    if neighbor_idx < len(CHUNKS_DATA):
                        chunk = CHUNKS_DATA[neighbor_idx]
                        chunk_id = chunk['id']
                        if chunk_id not in candidates:
                            candidates[chunk_id] = {'chunk': chunk, 'scores': {}}
                        candidates[chunk_id]['scores']['graph'] = 0.5

    # Score fusion: weighted sum of per-method normalized scores.
    print("Combining scores...")
    weights = {'dense': 0.35, 'bm25': 0.25, 'entity': 0.25, 'graph': 0.15}
    final_scores = []

    for chunk_id, data in candidates.items():
        chunk = data['chunk']
        scores = data['scores']

        final_score = 0
        for method, weight in weights.items():
            if method in scores:
                # Map each raw score into [0, 1] before weighting.
                if method == 'dense':
                    normalized = (scores[method] + 1) / 2  # cosine [-1,1] -> [0,1]
                elif method == 'bm25':
                    normalized = min(scores[method] / 10, 1)
                elif method == 'entity':
                    normalized = min(scores[method] / 3, 1)
                else:
                    normalized = scores[method]

                final_score += weight * normalized

        # Boost by chunk importance and query-type/section affinity.
        final_score *= chunk['importance_score']

        if query_analysis['query_type'] == 'precedent' and chunk['section_type'] == 'holding':
            final_score *= 1.5
        elif query_analysis['query_type'] == 'factual' and chunk['section_type'] == 'facts':
            final_score *= 1.5

        final_scores.append((chunk, final_score))

    final_scores.sort(key=lambda x: x[1], reverse=True)
    return final_scores[:top_k]
445
+
446
def generate_answer_with_reasoning(query: str, retrieved_chunks: List[Tuple[Dict[str, Any], float]]) -> Dict[str, Any]:
    """Generate an IRAC-structured legal answer grounded in retrieved chunks.

    Builds a context block from the retrieved (chunk, score) pairs, asks
    the Groq LLM for an IRAC analysis restricted to that context, and
    returns the answer with a confidence estimate and source metadata.

    Returns:
        Dict with 'answer', 'confidence', 'sources' on success, or an
        'error' key when the client is missing, no chunks were supplied,
        or the API call fails.
    """
    if not GROQ_CLIENT:
        return {'error': 'Groq client not initialized'}

    # Robustness fix: the confidence average below divides by
    # min(3, len(retrieved_chunks)); guard against an empty list.
    if not retrieved_chunks:
        return {'error': 'No retrieved chunks provided', 'sources': []}

    # Assemble the grounding context shown to the model.
    context_parts = []
    for i, (chunk, score) in enumerate(retrieved_chunks, 1):
        entities = ', '.join([e['text'] for e in chunk['entities'][:3]])
        context_parts.append(f"""
Document {i} [{chunk['title']}] - Relevance: {score:.2f}
Section: {chunk['section_type']}
Key Entities: {entities}
Content: {chunk['text'][:800]}
""")

    context = "\n---\n".join(context_parts)

    system_prompt = """You are an expert legal analyst. Provide thorough legal analysis using the IRAC method:
1. ISSUE: Identify the legal issue(s)
2. RULE: State the applicable legal rules/precedents
3. APPLICATION: Apply the rules to the facts
4. CONCLUSION: Provide a clear conclusion

CRITICAL: Base ALL responses on the provided document excerpts only. Quote directly when making claims.
If information is not in the excerpts, state "This information is not provided in the available documents."
"""

    user_prompt = f"""Query: {query}

Retrieved Legal Documents:
{context}

Please provide a comprehensive legal analysis using IRAC method. Cite the documents when making claims."""

    try:
        response = GROQ_CLIENT.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            model="llama-3.1-8b-instant",
            temperature=0.1,
            max_tokens=1000
        )

        answer = response.choices[0].message.content

        # Confidence: mean relevance of the top 3 chunks, capped at 100.
        avg_score = sum(score for _, score in retrieved_chunks[:3]) / min(3, len(retrieved_chunks))
        confidence = min(avg_score * 100, 100)

        return {
            'answer': answer,
            'confidence': confidence,
            'sources': [
                {
                    'chunk_id': chunk['id'],
                    'title': chunk['title'],
                    'section': chunk['section_type'],
                    'relevance_score': float(score),
                    'excerpt': chunk['text'][:200] + '...',
                    'entities': [e['text'] for e in chunk['entities'][:5]]
                }
                for chunk, score in retrieved_chunks
            ]
        }

    except Exception as e:
        return {
            'error': f'Error generating answer: {str(e)}',
            'sources': [{'chunk': c['text'][:200], 'score': s} for c, s in retrieved_chunks[:3]]
        }
519
+
520
+ # Main functions for external use
521
+ def process_documents(documents: List[Dict[str, str]]) -> Dict[str, Any]:
522
+ """Process documents and build indices"""
523
+ all_chunks = []
524
+
525
+ for doc in documents:
526
+ chunks = chunk_text_hierarchical(doc['text'], doc.get('title', 'Document'))
527
+ all_chunks.extend(chunks)
528
+
529
+ build_all_indices(all_chunks)
530
+
531
+ return {
532
+ 'success': True,
533
+ 'chunk_count': len(all_chunks),
534
+ 'message': f'Processed {len(documents)} documents into {len(all_chunks)} chunks'
535
+ }
536
+
537
def query_documents(query: str, top_k: int = 5) -> Dict[str, Any]:
    """Answer *query* against the indexed corpus.

    Runs query analysis, multi-stage retrieval, then LLM answer
    generation; the query analysis is attached to whatever dict is
    returned (answer or retrieval failure).
    """
    if not CHUNKS_DATA:
        return {'error': 'No documents indexed. Call process_documents first.'}

    analysis = analyze_query(query)
    hits = multi_stage_retrieval(analysis, top_k)

    if not hits:
        return {
            'error': 'No relevant documents found',
            'query_analysis': analysis
        }

    answer = generate_answer_with_reasoning(query, hits)
    answer['query_analysis'] = analysis
    return answer
559
+
560
def search_chunks_simple(query: str, top_k: int = 3) -> List[Dict[str, Any]]:
    """Lightweight search API kept for compatibility.

    Returns [{'chunk': {id, text, title}, 'score': float}, ...] for the
    top-ranked chunks, without invoking answer generation.
    """
    if not CHUNKS_DATA:
        return []

    hits = multi_stage_retrieval(analyze_query(query), top_k)

    return [
        {
            'chunk': {
                'id': chunk['id'],
                'text': chunk['text'],
                'title': chunk['title']
            },
            'score': score
        }
        for chunk, score in hits
    ]
580
+
581
def generate_conservative_answer(query: str, context_chunks: List[Dict[str, Any]]) -> str:
    """Compatibility wrapper: answer from pre-retrieved chunk dicts.

    Accepts the [{'chunk': ..., 'score': ...}] shape produced by
    search_chunks_simple and returns a plain answer string (or an
    error message on failure).
    """
    if not context_chunks:
        return "No relevant information found."

    # Re-shape into the (chunk, score) pairs the generator expects.
    pairs = [(item['chunk'], item['score']) for item in context_chunks]
    result = generate_answer_with_reasoning(query, pairs)

    if 'error' in result:
        return result['error']
    return result.get('answer', 'Unable to generate answer.')
simple/summarizer.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from groq import Groq
import re
from nltk.tokenize import sent_tokenize
import nltk

# Download required NLTK data (best-effort: offline environments simply
# skip the download).
try:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
except Exception:
    # Bug fix: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt at import time.
    pass
14
+
15
def summarize_legal_document(text, max_sentences=5, groq_api_key=None, model_path=None):
    """
    Summarize legal document text.

    Always produces an extractive summary; when a Groq API key is given,
    an abstractive LLM summary is attempted and used instead if it
    succeeds (any LLM failure silently falls back to the extractive one).

    Args:
        text: Input text to summarize
        max_sentences: Maximum number of sentences in summary (clamped to 3-20)
        groq_api_key: Optional Groq API key for enhanced summarization
        model_path: Optional custom model path

    Returns:
        Dictionary with summary, size metrics, and a success flag
    """
    if not text or not text.strip():
        return {"error": "Empty text provided", "success": False}

    # Clamp the requested length to a sane range.
    max_sentences = max(3, min(max_sentences, 20))

    result = {
        "original_length": len(text),
        "word_count": len(text.split()),
        "sentence_count": len(sent_tokenize(text)),
        "success": False
    }

    try:
        # Extractive summary is the guaranteed baseline.
        result["summary"] = _extractive_summarize(text, max_sentences)

        # Optionally upgrade to an LLM summary (best-effort).
        if groq_api_key:
            try:
                enhanced = _groq_summarize(text, max_sentences, groq_api_key)
                if enhanced:
                    result["summary"] = enhanced
            except Exception:
                pass

        # Final size metrics for whichever summary survived.
        summary_text = result.get("summary", "")
        result["summary_length"] = len(summary_text)
        result["compression_ratio"] = (
            result["summary_length"] / result["original_length"]
            if result["original_length"] > 0 else 0
        )
        result["success"] = True

    except Exception as exc:
        result["error"] = str(exc)
        result["success"] = False

    return result
69
+
70
def _extractive_summarize(text, max_sentences):
    """Score sentences with legal-document heuristics and keep the best.

    Scoring favors legal keywords, document position (opener, closer,
    first fifth), moderate sentence length, numbers/dates, and citation
    patterns.  Selected sentences are re-emitted in original order.
    Returns the text unchanged when it is already short enough.
    """
    sentences = sent_tokenize(text)

    if len(sentences) <= max_sentences:
        return text

    legal_keywords = [
        'court', 'judge', 'plaintiff', 'defendant', 'appellant', 'respondent',
        'held', 'ruled', 'decided', 'judgment', 'order', 'section', 'article',
        'provision', 'law', 'legal', 'case', 'appeal', 'petition', 'writ',
        'contract', 'agreement', 'liability', 'damages', 'evidence', 'witness',
        'statute', 'regulation', 'finding', 'conclusion', 'reasoning'
    ]

    scored = []
    total = len(sentences)

    for position, sentence in enumerate(sentences):
        if not sentence.strip():
            continue

        lowered = sentence.lower()

        # One point per legal keyword present.
        points = sum(1 for kw in legal_keywords if kw in lowered)

        # Positional bonus: opener, closer, then the first fifth.
        if position == 0:
            points += 3
        elif position == total - 1:
            points += 2
        elif position < total * 0.2:
            points += 1

        # Prefer moderately long sentences.
        n_words = len(sentence.split())
        if 15 <= n_words <= 40:
            points += 2
        elif 10 <= n_words <= 50:
            points += 1

        # Years, percentages, or dollar amounts.
        if re.search(r'\b\d{4}\b|\b\d+\s*(percent|%|\$)', sentence):
            points += 1

        # Case citations ("123 Cal. 456") or "v. Name" captions.
        if re.search(r'\d+\s+[A-Z][a-z]+\.?\s+\d+|\bv\.\s+[A-Z]', sentence):
            points += 2

        scored.append((points, position, sentence))

    # Keep the highest-scoring sentences, then restore document order.
    scored.sort(reverse=True, key=lambda item: item[0])
    chosen = scored[:max_sentences]
    chosen.sort(key=lambda item: item[1])

    return ' '.join(item[2] for item in chosen)
132
+
133
def _groq_summarize(text, max_sentences, api_key):
    """Abstractive summarization via the Groq LLM; returns None on any failure."""
    try:
        client = Groq(api_key=api_key)

        # Keep the prompt within a safe size for the model.
        if len(text) > 6000:
            text = text[:6000] + "\n[...text truncated...]"

        system_prompt = """You are an expert legal document summarizer. Create concise, accurate summaries that capture the most important information.

Guidelines:
1. Focus on key legal facts, holdings, and conclusions
2. Preserve important legal terminology and concepts
3. Maintain logical flow of legal reasoning
4. Include relevant case citations, statutes, or regulations
5. Be precise and avoid unnecessary elaboration"""

        user_prompt = f"""Please summarize the following legal document in approximately {max_sentences} sentences:

{text}

Provide a clear, concise summary:"""

        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            model="llama-3.1-8b-instant",
            temperature=0.2,
            max_tokens=800,
            top_p=0.9
        )

        candidate = response.choices[0].message.content.strip()
        # Reject trivially short responses so callers keep the extractive one.
        if candidate and len(candidate) > 20:
            return candidate

    except Exception:
        pass

    return None
176
+
177
+ def _chunk_text(text, max_words):
178
+ """Split text into chunks for processing"""
179
+ words = text.split()
180
+ chunks = []
181
+
182
+ for i in range(0, len(words), max_words):
183
+ chunk_words = words[i:i + max_words]
184
+ if chunk_words:
185
+ chunks.append(' '.join(chunk_words))
186
+
187
+ return chunks