manabb committed on
Commit
b954e3d
·
verified ·
1 Parent(s): 87a3c7e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -5
app.py CHANGED
@@ -49,6 +49,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
49
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
50
 
51
  # Wrap in pipeline
 
52
  pipe1 = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
53
  if llm1 is None:
54
  llm1 = HuggingFacePipeline(pipeline=pipe1)
@@ -75,6 +76,13 @@ if llm is None:
75
  llm = HuggingFacePipeline(pipeline=pipe)
76
  #=============================================
77
 
 
 
 
 
 
 
 
78
  def create_faiss_index(repo_id, file, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
79
  """Create FAISS index from PDF and upload to HF dataset repo"""
80
  message = "Index creation started"
@@ -284,7 +292,33 @@ def upload_and_prepare_old(file,user):
284
  #return mm
285
  #create_faiss_index(repo_id, file_input)
286
  #======================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
 
 
 
288
  def generate_qa_chain(repo_id, embedding_model="sentence-transformers/all-MiniLM-L6-v2", llm=None):
289
  """
290
  Generate QA chain from HF dataset repo FAISS index
@@ -370,6 +404,7 @@ def ask_question(query):
370
 
371
  response = qa_chain.invoke({"query": query})
372
  result = response["result"]
 
373
  sources = response.get("source_documents", [])
374
 
375
  source_info = ""
@@ -379,7 +414,7 @@ def ask_question(query):
379
  repo_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/docs/{filename}"
380
  source_info += f"\n**Source {i+1}:** [{filename} (Page {page_num})]({repo_url})"
381
 
382
- return f"{result}\n\n**πŸ“„ Sources:**{source_info}"
383
 
384
  def ask_question1(query):
385
  if not query or not qa_chain1:
@@ -387,6 +422,7 @@ def ask_question1(query):
387
 
388
  response = qa_chain1.invoke({"query": query})
389
  result = response["result"]
 
390
  sources = response.get("source_documents", [])
391
 
392
  source_info = ""
@@ -396,7 +432,7 @@ def ask_question1(query):
396
  repo_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/docs/{filename}"
397
  source_info += f"\n**Source {i+1}:** [{filename} (Page {page_num})]({repo_url})"
398
 
399
- return f"{result}\n\n**πŸ“„ Sources:**{source_info}"
400
  #===============================================
401
  #delete entire repo
402
  def delete_entire_repo(user):
@@ -445,7 +481,7 @@ with gr.Blocks(title="N R L C H A T B O T - for commercial procurement - Supply"
445
  """) as demo:
446
  gr.Markdown("## 🧠 For use of NRL procurement department Only")
447
  with gr.Row():
448
- # LEFT COLUMN: Document Management
449
  with gr.Column(elem_id="blue-col",scale=1):
450
  gr.Markdown("## 🧠 Using heavy TinyLama Model")
451
  with gr.Row():
@@ -463,7 +499,7 @@ with gr.Blocks(title="N R L C H A T B O T - for commercial procurement - Supply"
463
  lines=8
464
  )
465
  query_btn.click(ask_question, inputs=query_input, outputs=answer_output)
466
- # RIGHT COLUMN: Document Management
467
  with gr.Column(elem_id="green-col",scale=2):
468
  gr.Markdown("## 🧠 Using ligth model - google flan-t5")
469
  Index_processing_output1=gr.Textbox(label="πŸ“ Status for google flan-t5", interactive=False)
@@ -475,7 +511,16 @@ with gr.Blocks(title="N R L C H A T B O T - for commercial procurement - Supply"
475
  label="βœ… Answer with Document Links",
476
  lines=8
477
  )
478
- query_btn1.click(ask_question1, inputs=query_input1, outputs=answer_output1)
 
 
 
 
 
 
 
 
 
479
 
480
  with gr.Row():
481
  # LEFT COLUMN: Document Management
 
49
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
50
 
51
  # Wrap in pipeline
52
+ #pipe1 = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
53
  pipe1 = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
54
  if llm1 is None:
55
  llm1 = HuggingFacePipeline(pipeline=pipe1)
 
76
  llm = HuggingFacePipeline(pipeline=pipe)
77
  #=============================================
78
 
79
def format_as_bullets(text):
    """Convert an answer into a bulleted list, one bullet per non-blank line.

    Each line is stripped of surrounding whitespace. A line that already
    starts with a list marker ("-", "*" or a bullet, followed by a space)
    has that marker removed first so it is not bulleted twice.

    Args:
        text: The answer text, possibly spanning several lines. ``None`` is
            treated as an empty answer.

    Returns:
        The bulleted text joined with newlines. If *text* contains no
        non-blank lines it is returned unchanged; ``None`` yields ``""``.
    """
    if text is None:
        return ""

    # Written as an escape so the glyph survives any source re-encoding.
    bullet = "\u2022"
    existing_markers = (f"{bullet} ", "- ", "* ")

    bullet_lines = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if line.startswith(existing_markers):
            # Every marker is exactly two characters: symbol + space.
            line = line[2:].lstrip()
        if line:
            bullet_lines.append(f"{bullet} {line}")

    return "\n".join(bullet_lines) if bullet_lines else text
84
+ #=============================================
85
+
86
  def create_faiss_index(repo_id, file, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
87
  """Create FAISS index from PDF and upload to HF dataset repo"""
88
  message = "Index creation started"
 
292
  #return mm
293
  #create_faiss_index(repo_id, file_input)
294
  #======================================================================
295
def get_document_summary(repo_id, query, llm=None):
    """Summarize the indexed passages most similar to *query*.

    Downloads the FAISS index stored in the Hugging Face dataset repo
    *repo_id*, retrieves the 20 chunks closest to *query*, and asks *llm*
    to summarize a truncated concatenation of them.

    Args:
        repo_id: Hugging Face dataset repo that holds ``index.faiss``.
        query: Text used for the similarity search.
        llm: Model used to write the summary. Either an object exposing
            ``invoke(prompt)`` or a plain callable taking the prompt.

    Returns:
        The summary produced by *llm*, or the string
        ``"Summary unavailable"`` when no model or query is given or when
        any step fails.
    """
    import logging

    unavailable = "Summary unavailable"

    # Nothing can be summarized without a model or a query; bail out before
    # paying for the index download.
    if llm is None or not query:
        return unavailable

    try:
        # Load vectorstore
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        faiss_path = hf_hub_download(repo_id=repo_id, filename="index.faiss", repo_type="dataset")
        # FAISS.load_local reads index.pkl from the same folder as index.faiss,
        # so make sure it is in the local cache too.
        # NOTE(review): assumes index.pkl sits next to index.faiss in the repo.
        hf_hub_download(repo_id=repo_id, filename="index.pkl", repo_type="dataset")
        vectorstore = FAISS.load_local(
            os.path.dirname(faiss_path),
            embeddings,
            # Pickle-based loading: only safe for repos this app controls.
            allow_dangerous_deserialization=True,
        )

        # Get top documents; keep only the head of each chunk so the prompt
        # stays small.
        docs = vectorstore.similarity_search(query, k=20)
        context = "\n".join(doc.page_content[:200] for doc in docs)

        # Summarize with the supplied LLM
        summary_prompt = f"""
Summarize these context:

{context[:3000]}

Summary:
"""
        # Prefer the Runnable-style invoke(); fall back to a direct call for
        # plain callables.
        invoke = getattr(llm, "invoke", None)
        if callable(invoke):
            return invoke(summary_prompt)
        return llm(summary_prompt)
    except Exception:
        # Best-effort feature: keep the UI responsive but record the cause.
        logging.getLogger(__name__).exception(
            "Could not build document summary for repo %s", repo_id
        )
        return unavailable
319
 
320
+
321
+ #======================================================================
322
  def generate_qa_chain(repo_id, embedding_model="sentence-transformers/all-MiniLM-L6-v2", llm=None):
323
  """
324
  Generate QA chain from HF dataset repo FAISS index
 
404
 
405
  response = qa_chain.invoke({"query": query})
406
  result = response["result"]
407
+ bullet_result = format_as_bullets(result)
408
  sources = response.get("source_documents", [])
409
 
410
  source_info = ""
 
414
  repo_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/docs/{filename}"
415
  source_info += f"\n**Source {i+1}:** [{filename} (Page {page_num})]({repo_url})"
416
 
417
+ return f"{result}\n\n In bullet form \n{bullet_result}\n\n**πŸ“„ Sources:**{source_info}"
418
 
419
  def ask_question1(query):
420
  if not query or not qa_chain1:
 
422
 
423
  response = qa_chain1.invoke({"query": query})
424
  result = response["result"]
425
+ bullet_result = format_as_bullets(result)
426
  sources = response.get("source_documents", [])
427
 
428
  source_info = ""
 
432
  repo_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/docs/{filename}"
433
  source_info += f"\n**Source {i+1}:** [{filename} (Page {page_num})]({repo_url})"
434
 
435
+ return f"{result}\n\n In bullet form \n{bullet_result}\n\n**πŸ“„ Sources:**{source_info}"
436
  #===============================================
437
  #delete entire repo
438
  def delete_entire_repo(user):
 
481
  """) as demo:
482
  gr.Markdown("## 🧠 For use of NRL procurement department Only")
483
  with gr.Row():
484
+ # LEFT COLUMN: TinyLama Model
485
  with gr.Column(elem_id="blue-col",scale=1):
486
  gr.Markdown("## 🧠 Using heavy TinyLama Model")
487
  with gr.Row():
 
499
  lines=8
500
  )
501
  query_btn.click(ask_question, inputs=query_input, outputs=answer_output)
502
+ # RIGHT COLUMN: google/flan-t5
503
  with gr.Column(elem_id="green-col",scale=2):
504
  gr.Markdown("## 🧠 Using ligth model - google flan-t5")
505
  Index_processing_output1=gr.Textbox(label="πŸ“ Status for google flan-t5", interactive=False)
 
511
  label="βœ… Answer with Document Links",
512
  lines=8
513
  )
514
# Placeholder that is filled once the follow-up summary step finishes.
summary_output = gr.Markdown("**Summary will appear here**")
# Answer the question first, then auto-trigger a summary for the same query.
# Gradio `inputs` must be components, so the non-component arguments
# (repo_id and llm1) are bound inside the lambda and looked up at call time.
query_btn1.click(
    ask_question1,
    inputs=query_input1,
    outputs=answer_output1,
).then(
    lambda query: get_document_summary(repo_id, query, llm=llm1),
    inputs=query_input1,
    outputs=summary_output,
)
524
 
525
  with gr.Row():
526
  # LEFT COLUMN: Document Management