akazmi committed on
Commit
0c58ac5
·
verified ·
1 Parent(s): 7aec612

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -56
app.py CHANGED
@@ -1,86 +1,85 @@
1
  import gradio as gr
2
- import pdfplumber
3
  import torch
4
- from sentence_transformers import SentenceTransformer
5
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
6
  import numpy as np
 
 
7
  from sklearn.metrics.pairwise import cosine_similarity
8
- import re
9
 
10
- # Load models
11
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
12
- embedder = SentenceTransformer("all-MiniLM-L6-v2", device=device)
13
 
14
- model_name = "google/flan-t5-base" # stronger than 'small'
 
15
  tokenizer = AutoTokenizer.from_pretrained(model_name)
16
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
17
- qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
18
 
19
- # Extract and clean PDF text
20
  def read_pdf(file_path):
21
  try:
22
  with pdfplumber.open(file_path) as pdf:
23
- text = "\n".join([page.extract_text() or "" for page in pdf.pages])
24
- return re.sub(r'\n+', '\n', text.strip())
25
  except Exception as e:
26
- return f" PDF reading failed: {e}"
27
 
28
- # Chunk the text into clean sentence-like blocks
29
- def chunk_text(text, max_length=500):
30
- sentences = re.split(r'(?<=[.!?])\s+', text)
31
- chunks = []
32
- current_chunk = ""
33
  for sentence in sentences:
34
- if len(current_chunk) + len(sentence) <= max_length:
35
- current_chunk += sentence + " "
36
  else:
37
- chunks.append(current_chunk.strip())
38
- current_chunk = sentence + " "
39
- if current_chunk:
40
- chunks.append(current_chunk.strip())
41
  return chunks
42
 
43
- # Embed and get top chunks
44
- def get_top_chunks(question, chunks, k=2):
45
- q_embed = embedder.encode([question])
46
- chunk_embeds = embedder.encode(chunks)
47
- sims = cosine_similarity(q_embed, chunk_embeds)[0]
48
- top_k_idx = np.argsort(sims)[-k:][::-1]
49
- return "\n\n".join([chunks[i] for i in top_k_idx])
50
 
51
- # Generate answer
52
- def answer_question(pdf_file, user_question):
53
- if not pdf_file or not user_question.strip():
54
- return "⚠️ Upload a PDF and enter your question."
 
 
55
 
56
- text = read_pdf(pdf_file.name)
57
- if not text or text.startswith(""):
58
- return text
59
 
60
- chunks = chunk_text(text)
61
- relevant = get_top_chunks(user_question, chunks)
62
 
63
  prompt = (
64
- f"You are a legal document assistant. Based on the context below, "
65
- f"answer the question briefly and clearly.\n\n"
66
- f"Context:\n{relevant}\n\n"
67
- f"Question: {user_question}\n\nAnswer:"
68
  )
69
 
70
  try:
71
- result = qa_pipeline(prompt, max_new_tokens=256, do_sample=False)
72
- return result[0]["generated_text"].split("Answer:")[-1].strip()
73
  except Exception as e:
74
- return f" Generation error: {e}"
75
 
76
- # Gradio interface
77
  with gr.Blocks() as demo:
78
- gr.Markdown("## 📚 Legal Document Q&A Assistant")
79
- pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
80
- question_input = gr.Textbox(label="Ask a question")
81
- answer_output = gr.Textbox(label="Answer", lines=8)
82
- ask_button = gr.Button("Get Answer")
83
-
84
- ask_button.click(answer_question, inputs=[pdf_input, question_input], outputs=answer_output)
85
 
86
  demo.launch()
 
1
  import gradio as gr
 
2
  import torch
3
+ import pdfplumber
4
+ import re
5
  import numpy as np
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
7
+ from sentence_transformers import SentenceTransformer
8
  from sklearn.metrics.pairwise import cosine_similarity
 
9
 
10
# ===== Load Embedding Model =====
# Sentence encoder used to embed both the user's question and the document
# chunks so they can be compared by cosine similarity.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# ===== Load QA Model =====
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Half precision only when CUDA is available; full precision otherwise.
# device_map="auto" delegates weight placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
# Deliberately no `device=` argument: the model has already been placed by
# device_map="auto", and asking the pipeline to move it again conflicts with
# that placement (NOTE(review): recent transformers releases reject this
# combination outright, older ones only warn -- confirm against the pinned
# version).
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
18
 
19
# ===== Read PDF and Clean =====
def read_pdf(file_path):
    """Return the text of every page in the PDF at ``file_path``.

    Page texts are joined with a newline; a page for which no text is
    extracted contributes an empty string.

    Failures are reported in-band rather than raised: on any exception the
    returned string starts with ``"Error reading PDF: "`` followed by the
    exception message.
    """
    try:
        with pdfplumber.open(file_path) as document:
            page_texts = []
            for page in document.pages:
                # extract_text() may yield None; normalise that to "".
                page_texts.append(page.extract_text() or "")
            return "\n".join(page_texts)
    except Exception as exc:
        return f"Error reading PDF: {exc}"
26
 
27
# ===== Smart Sentence Chunking =====
def chunk_text(text, max_len=500):
    """Split ``text`` into sentence-aligned chunks of roughly ``max_len`` chars.

    Sentences are detected by whitespace that follows ``.``, ``!``, ``?`` or
    the Arabic question mark.  Consecutive sentences are packed into the same
    chunk (joined by a single space) until adding the next one would exceed
    ``max_len``.  A single sentence longer than ``max_len`` is never split; it
    becomes a chunk of its own.

    Args:
        text: The document text to split.
        max_len: Soft upper bound on chunk length, in characters.

    Returns:
        A list of non-empty chunk strings.  Empty or whitespace-only input
        yields an empty list.
    """
    sentences = re.split(r'(?<=[.!?؟])\s+', text.strip())

    chunks = []
    pending = []      # sentences collected for the chunk being built
    pending_len = 0   # each sentence counted with one trailing separator

    for sentence in sentences:
        if not sentence:
            continue
        # Flush only when something is pending; otherwise an over-long first
        # sentence would emit an empty chunk ahead of itself.
        if pending and pending_len + len(sentence) > max_len:
            chunks.append(" ".join(pending))
            pending, pending_len = [], 0
        pending.append(sentence)
        pending_len += len(sentence) + 1

    if pending:
        chunks.append(" ".join(pending))
    return chunks
40
 
41
# ===== Semantic Retrieval =====
def get_relevant_chunks(question, chunks, top_k=2):
    """Return the ``top_k`` chunks most similar to ``question``.

    The question and every chunk are embedded with the module-level
    ``embedder`` and ranked by cosine similarity, best match first.

    Args:
        question: The user's question.
        chunks: Candidate text chunks to rank.
        top_k: Maximum number of chunks to return.

    Returns:
        The selected chunks joined by a blank line, or ``""`` when there is
        nothing to select (no chunks, or ``top_k`` not positive).
    """
    # Guard first: there is nothing to rank with no chunks, and with the
    # original ``[-top_k:]`` slice a top_k of 0 selected *every* chunk.
    if not chunks or top_k <= 0:
        return ""

    q_vec = embedder.encode([question])
    c_vecs = embedder.encode(chunks)
    sims = cosine_similarity(q_vec, c_vecs)[0]
    # Descending order of similarity, then keep the leading top_k entries.
    top_indices = np.argsort(sims)[::-1][:top_k]
    return "\n\n".join(chunks[i] for i in top_indices)
48
 
49
# ===== Generate Answer =====
def answer_question(file, question):
    """Answer ``question`` using the contents of the uploaded PDF.

    Pipeline: read the PDF text, split it into chunks, retrieve the chunks
    most similar to the question, and ask the text-generation pipeline to
    answer from that context.

    Args:
        file: The uploaded PDF as delivered by the Gradio file input.
        question: The user's question.

    Returns:
        The generated answer, or a human-readable message describing why no
        answer could be produced.  This function reports problems through its
        return value so the message can be shown in the output box.
    """
    if not file:
        return "⚠️ Please upload a PDF."
    # `not question` also covers None, which has no .strip().
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    # NOTE(review): the upload may arrive as an object exposing `.name` or as
    # a plain path string depending on the Gradio version/config -- confirm.
    pdf_path = getattr(file, "name", file)

    raw_text = read_pdf(pdf_path)
    # Match the full prefix read_pdf uses for failures, so a document whose
    # text merely begins with the word "Error" is not mistaken for one.
    if raw_text.startswith("Error reading PDF:"):
        return raw_text
    if not raw_text.strip():
        # e.g. a PDF with no extractable text: nothing to retrieve from.
        return "⚠️ No extractable text was found in this PDF."

    chunks = chunk_text(raw_text)
    context = get_relevant_chunks(question, chunks)

    prompt = (
        f"You are a legal expert. Based on the context below, answer the question in a detailed and explanatory manner.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n\n"
        f"Answer:"
    )

    try:
        # Greedy decoding (do_sample=False); a temperature setting has no
        # effect without sampling, so it is not passed.
        # return_full_text=False yields only the completion, so the prompt
        # does not have to be cut off by searching for "Answer:" (which would
        # also truncate an answer that itself contains that text).
        response = qa_pipeline(
            prompt,
            max_new_tokens=300,
            do_sample=False,
            return_full_text=False,
        )
        return response[0]["generated_text"].strip()
    except Exception as e:
        return f"Error generating answer: {e}"
75
 
76
# ===== Gradio Interface =====
# Layout: a PDF upload, a question box, an answer box, and a button that runs
# answer_question on the first two and writes the result into the third.
with gr.Blocks() as demo:
    gr.Markdown("## 📘 Document Question Answering (RAG-powered)")

    # Inputs
    file = gr.File(
        label="Upload PDF",
        file_types=[".pdf"],
    )
    question = gr.Textbox(
        label="Ask a question",
        placeholder="e.g., Is there any section for cost audit?",
    )

    # Output
    answer = gr.Textbox(
        label="Answer",
        lines=10,
    )

    # Wiring
    submit = gr.Button("Get Answer")
    submit.click(
        fn=answer_question,
        inputs=[file, question],
        outputs=answer,
    )

demo.launch()