kn29 committed on
Commit
3d5f4f7
·
verified ·
1 Parent(s): eea74f2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -12
app.py CHANGED
@@ -22,7 +22,7 @@ import pytesseract
22
 
23
  # Import our models
24
  from simple.rag import initialize_models, process_documents, create_embedding, chunk_text_hierarchical
25
- from simple.ner import extract_legal_entities
26
  from simple.summarizer import summarize_legal_document
27
 
28
  # Configure logging
@@ -37,9 +37,12 @@ cleanup_task = None
37
  # Configuration
38
  MONGODB_URI = os.getenv("MONGODB_URI", "mongodb+srv://username:password@cluster.mongodb.net/")
39
  DATABASE_NAME = os.getenv("DATABASE_NAME", "legal_rag_system")
40
- HF_MODEL_ID = os.getenv("HF_MODEL_ID", "sentence-transformers/all-MiniLM-L6-v2")
 
41
  GROQ_API_KEY = os.getenv("GROQ_API_KEY", None)
42
  SESSION_EXPIRE_HOURS = int(os.getenv("SESSION_EXPIRE_HOURS", "24"))
 
 
43
 
44
  # Supported file types
45
  SUPPORTED_EXTENSIONS = {'.pdf', '.txt', '.docx', '.doc'}
@@ -89,10 +92,17 @@ async def startup_event():
89
  # Create indexes
90
  await create_indexes()
91
 
92
- # Initialize ML models
93
- logger.info("🤖 Loading ML models...")
94
  initialize_models(HF_MODEL_ID, GROQ_API_KEY)
95
- logger.info("✅ Models loaded successfully")
 
 
 
 
 
 
 
96
 
97
  # Start cleanup task
98
  cleanup_task = asyncio.create_task(periodic_cleanup())
@@ -250,13 +260,19 @@ async def process_document_pipeline(
250
  {"$set": {"status": "processing", "updated_at": datetime.utcnow()}}
251
  )
252
 
253
- # Step 1: NER Processing
254
- logger.info(f"🔍 Running NER for session {session_id}")
255
- ner_results = extract_legal_entities(
 
256
  text,
257
- model_id="kn29/my-ner-model",
258
- hf_token=os.getenv("HF_TOKEN") # optional, if your model is private
259
  )
 
 
 
 
 
 
260
 
261
  # Store NER results
262
  await db.ner_results.insert_one({
@@ -267,7 +283,7 @@ async def process_document_pipeline(
267
  })
268
 
269
  # Step 2: Summarization
270
- logger.info(f"📄 Running summarization for session {session_id}")
271
  summary_results = summarize_legal_document(
272
  text,
273
  max_sentences=5,
@@ -283,7 +299,7 @@ async def process_document_pipeline(
283
  })
284
 
285
  # Step 3: Chunking and Embedding
286
- logger.info(f"🧩 Creating chunks and embeddings for session {session_id}")
287
  chunks = chunk_text_hierarchical(text, filename)
288
 
289
  # Create embeddings and store chunks
 
22
 
23
  # Import our models
24
  from simple.rag import initialize_models, process_documents, create_embedding, chunk_text_hierarchical
25
+ from simple.ner import process_text as run_ner
26
  from simple.summarizer import summarize_legal_document
27
 
28
  # Configure logging
 
37
  # Configuration
38
  MONGODB_URI = os.getenv("MONGODB_URI", "mongodb+srv://username:password@cluster.mongodb.net/")
39
  DATABASE_NAME = os.getenv("DATABASE_NAME", "legal_rag_system")
40
+ # Hardcode embedding model per request
41
+ HF_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
42
  GROQ_API_KEY = os.getenv("GROQ_API_KEY", None)
43
  SESSION_EXPIRE_HOURS = int(os.getenv("SESSION_EXPIRE_HOURS", "24"))
44
+ # Optional HF token (if NER model is private)
45
+ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")
46
 
47
  # Supported file types
48
  SUPPORTED_EXTENSIONS = {'.pdf', '.txt', '.docx', '.doc'}
 
92
  # Create indexes
93
  await create_indexes()
94
 
95
+ # Initialize ML models (embeddings / retrieval backbone)
96
+ logger.info(f"🤖 Loading embedding model for RAG: {HF_MODEL_ID}")
97
  initialize_models(HF_MODEL_ID, GROQ_API_KEY)
98
+ logger.info(f"✅ Embedding model loaded: {HF_MODEL_ID}")
99
+
100
+ # Surface NER token presence (actual NER loads lazily in simple.ner)
101
+ if HUGGINGFACE_TOKEN:
102
+ os.environ["HUGGINGFACE_TOKEN"] = HUGGINGFACE_TOKEN
103
+ logger.info("🔐 HUGGINGFACE_TOKEN detected for NER model access")
104
+ else:
105
+ logger.info("ℹ️ No HUGGINGFACE_TOKEN provided (NER model assumed public)")
106
 
107
  # Start cleanup task
108
  cleanup_task = asyncio.create_task(periodic_cleanup())
 
260
  {"$set": {"status": "processing", "updated_at": datetime.utcnow()}}
261
  )
262
 
263
+ # Step 1: NER Processing (spaCy pipeline from Hugging Face)
264
+ ner_model_id = "kn29/my-ner-model"
265
+ logger.info(f"🔍 Running NER for session {session_id} using model: {ner_model_id}")
266
+ ner_results = run_ner(
267
  text,
268
+ model_id=ner_model_id
 
269
  )
270
+ if ner_results.get("error"):
271
+ logger.error(f"❌ NER failed for session {session_id}: {ner_results['error']}")
272
+ else:
273
+ logger.info(
274
+ f"✅ NER completed for session {session_id} • total_entities={ner_results.get('total_entities', 0)} • labels={len(ner_results.get('unique_labels', []))}"
275
+ )
276
 
277
  # Store NER results
278
  await db.ner_results.insert_one({
 
283
  })
284
 
285
  # Step 2: Summarization
286
+ logger.info(f"📄 Running summarization for session {session_id} (Groq={'on' if GROQ_API_KEY else 'off'})")
287
  summary_results = summarize_legal_document(
288
  text,
289
  max_sentences=5,
 
299
  })
300
 
301
  # Step 3: Chunking and Embedding
302
+ logger.info(f"🧩 Creating chunks and embeddings for session {session_id} using {HF_MODEL_ID}")
303
  chunks = chunk_text_hierarchical(text, filename)
304
 
305
  # Create embeddings and store chunks