Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,7 +22,7 @@ import pytesseract
|
|
| 22 |
|
| 23 |
# Import our models
|
| 24 |
from simple.rag import initialize_models, process_documents, create_embedding, chunk_text_hierarchical
|
| 25 |
-
from simple.ner import
|
| 26 |
from simple.summarizer import summarize_legal_document
|
| 27 |
|
| 28 |
# Configure logging
|
|
@@ -37,9 +37,12 @@ cleanup_task = None
|
|
| 37 |
# Configuration
|
| 38 |
MONGODB_URI = os.getenv("MONGODB_URI", "mongodb+srv://username:password@cluster.mongodb.net/")
|
| 39 |
DATABASE_NAME = os.getenv("DATABASE_NAME", "legal_rag_system")
|
| 40 |
-
|
|
|
|
| 41 |
GROQ_API_KEY = os.getenv("GROQ_API_KEY", None)
|
| 42 |
SESSION_EXPIRE_HOURS = int(os.getenv("SESSION_EXPIRE_HOURS", "24"))
|
|
|
|
|
|
|
| 43 |
|
| 44 |
# Supported file types
|
| 45 |
SUPPORTED_EXTENSIONS = {'.pdf', '.txt', '.docx', '.doc'}
|
|
@@ -89,10 +92,17 @@ async def startup_event():
|
|
| 89 |
# Create indexes
|
| 90 |
await create_indexes()
|
| 91 |
|
| 92 |
-
# Initialize ML models
|
| 93 |
-
logger.info("🤖 Loading
|
| 94 |
initialize_models(HF_MODEL_ID, GROQ_API_KEY)
|
| 95 |
-
logger.info("β
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
# Start cleanup task
|
| 98 |
cleanup_task = asyncio.create_task(periodic_cleanup())
|
|
@@ -250,13 +260,19 @@ async def process_document_pipeline(
|
|
| 250 |
{"$set": {"status": "processing", "updated_at": datetime.utcnow()}}
|
| 251 |
)
|
| 252 |
|
| 253 |
-
# Step 1: NER Processing
|
| 254 |
-
|
| 255 |
-
|
|
|
|
| 256 |
text,
|
| 257 |
-
model_id=
|
| 258 |
-
hf_token=os.getenv("HF_TOKEN") # optional, if your model is private
|
| 259 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
|
| 261 |
# Store NER results
|
| 262 |
await db.ner_results.insert_one({
|
|
@@ -267,7 +283,7 @@ async def process_document_pipeline(
|
|
| 267 |
})
|
| 268 |
|
| 269 |
# Step 2: Summarization
|
| 270 |
-
logger.info(f"π Running summarization for session {session_id}")
|
| 271 |
summary_results = summarize_legal_document(
|
| 272 |
text,
|
| 273 |
max_sentences=5,
|
|
@@ -283,7 +299,7 @@ async def process_document_pipeline(
|
|
| 283 |
})
|
| 284 |
|
| 285 |
# Step 3: Chunking and Embedding
|
| 286 |
-
logger.info(f"🧩 Creating chunks and embeddings for session {session_id}")
|
| 287 |
chunks = chunk_text_hierarchical(text, filename)
|
| 288 |
|
| 289 |
# Create embeddings and store chunks
|
|
|
|
| 22 |
|
| 23 |
# Import our models
|
| 24 |
from simple.rag import initialize_models, process_documents, create_embedding, chunk_text_hierarchical
|
| 25 |
+
from simple.ner import process_text as run_ner
|
| 26 |
from simple.summarizer import summarize_legal_document
|
| 27 |
|
| 28 |
# Configure logging
|
|
|
|
| 37 |
# Configuration
|
| 38 |
MONGODB_URI = os.getenv("MONGODB_URI", "mongodb+srv://username:password@cluster.mongodb.net/")
|
| 39 |
DATABASE_NAME = os.getenv("DATABASE_NAME", "legal_rag_system")
|
| 40 |
+
# Hardcode embedding model per request
|
| 41 |
+
HF_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
|
| 42 |
GROQ_API_KEY = os.getenv("GROQ_API_KEY", None)
|
| 43 |
SESSION_EXPIRE_HOURS = int(os.getenv("SESSION_EXPIRE_HOURS", "24"))
|
| 44 |
+
# Optional HF token (if NER model is private)
|
| 45 |
+
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")
|
| 46 |
|
| 47 |
# Supported file types
|
| 48 |
SUPPORTED_EXTENSIONS = {'.pdf', '.txt', '.docx', '.doc'}
|
|
|
|
| 92 |
# Create indexes
|
| 93 |
await create_indexes()
|
| 94 |
|
| 95 |
+
# Initialize ML models (embeddings / retrieval backbone)
|
| 96 |
+
logger.info(f"🤖 Loading embedding model for RAG: {HF_MODEL_ID}")
|
| 97 |
initialize_models(HF_MODEL_ID, GROQ_API_KEY)
|
| 98 |
+
logger.info(f"✅ Embedding model loaded: {HF_MODEL_ID}")
|
| 99 |
+
|
| 100 |
+
# Surface NER token presence (actual NER loads lazily in simple.ner)
|
| 101 |
+
if HUGGINGFACE_TOKEN:
|
| 102 |
+
os.environ["HUGGINGFACE_TOKEN"] = HUGGINGFACE_TOKEN
|
| 103 |
+
logger.info("π HUGGINGFACE_TOKEN detected for NER model access")
|
| 104 |
+
else:
|
| 105 |
+
logger.info("ℹ️ No HUGGINGFACE_TOKEN provided (NER model assumed public)")
|
| 106 |
|
| 107 |
# Start cleanup task
|
| 108 |
cleanup_task = asyncio.create_task(periodic_cleanup())
|
|
|
|
| 260 |
{"$set": {"status": "processing", "updated_at": datetime.utcnow()}}
|
| 261 |
)
|
| 262 |
|
| 263 |
+
# Step 1: NER Processing (spaCy pipeline from Hugging Face)
|
| 264 |
+
ner_model_id = "kn29/my-ner-model"
|
| 265 |
+
logger.info(f"π Running NER for session {session_id} using model: {ner_model_id}")
|
| 266 |
+
ner_results = run_ner(
|
| 267 |
text,
|
| 268 |
+
model_id=ner_model_id
|
|
|
|
| 269 |
)
|
| 270 |
+
if ner_results.get("error"):
|
| 271 |
+
logger.error(f"β NER failed for session {session_id}: {ner_results['error']}")
|
| 272 |
+
else:
|
| 273 |
+
logger.info(
|
| 274 |
+
f"✅ NER completed for session {session_id} • total_entities={ner_results.get('total_entities', 0)} • labels={len(ner_results.get('unique_labels', []))}"
|
| 275 |
+
)
|
| 276 |
|
| 277 |
# Store NER results
|
| 278 |
await db.ner_results.insert_one({
|
|
|
|
| 283 |
})
|
| 284 |
|
| 285 |
# Step 2: Summarization
|
| 286 |
+
logger.info(f"π Running summarization for session {session_id} (Groq={'on' if GROQ_API_KEY else 'off'})")
|
| 287 |
summary_results = summarize_legal_document(
|
| 288 |
text,
|
| 289 |
max_sentences=5,
|
|
|
|
| 299 |
})
|
| 300 |
|
| 301 |
# Step 3: Chunking and Embedding
|
| 302 |
+
logger.info(f"🧩 Creating chunks and embeddings for session {session_id} using {HF_MODEL_ID}")
|
| 303 |
chunks = chunk_text_hierarchical(text, filename)
|
| 304 |
|
| 305 |
# Create embeddings and store chunks
|