SujathaL commited on
Commit
5f0b01f
·
verified ·
1 Parent(s): 28341e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -16
app.py CHANGED
@@ -2,19 +2,17 @@ import streamlit as st
2
  import pdfplumber
3
  import faiss
4
  import numpy as np
 
5
  from sentence_transformers import SentenceTransformer
6
- from transformers import AutoModelForCausalLM, AutoTokenizer
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
8
 
9
- # Load Mistral-7B Model for Generative Answers
10
- model_name = "google/flan-t5-base" # Smallest alternative, works on free tier
11
-
12
-
13
-
14
  tokenizer = AutoTokenizer.from_pretrained(model_name)
15
- model = AutoModelForCausalLM.from_pretrained(model_name)
16
 
17
- # Load Sentence Embeddings Model for Better Context Matching
18
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
19
 
20
  # Function to Extract & Clean PDF Text
@@ -22,8 +20,10 @@ def extract_clean_text(pdf_path):
22
  text = ""
23
  with pdfplumber.open(pdf_path) as pdf:
24
  for page in pdf.pages:
25
- text += page.extract_text() + "\n"
26
- return text.replace("\n", " ")
 
 
27
 
28
  # Function to Split Text into Chunks
29
  def split_text(text, chunk_size=500):
@@ -32,22 +32,30 @@ def split_text(text, chunk_size=500):
32
 
33
  # Function to Create FAISS Vector Database
34
  def create_faiss_index(chunks):
35
- embeddings = np.array([embedding_model.encode(chunk) for chunk in chunks])
 
 
36
  index = faiss.IndexFlatL2(embeddings.shape[1])
37
  index.add(embeddings)
38
  return index, chunks, embeddings
39
 
40
- # Function to Find Best Matching Chunk
41
  def find_best_chunk(question, index, chunks, embeddings):
42
- question_embedding = embedding_model.encode(question).reshape(1, -1)
 
 
43
  _, closest_idx = index.search(np.array(question_embedding), 1)
44
  return chunks[closest_idx[0][0]]
45
 
46
  # Function to Generate a Long, Detailed Answer
47
  def get_answer(question, context):
48
  input_text = f"Question: {question}\nContext: {context}\nAnswer:"
49
- inputs = tokenizer(input_text, return_tensors="pt")
50
- output = model.generate(**inputs, max_length=300, temperature=0.7)
 
 
 
 
51
  return tokenizer.decode(output[0], skip_special_tokens=True)
52
 
53
  # Streamlit UI
@@ -59,7 +67,10 @@ pdf_text = extract_clean_text(pdf_path)
59
  chunks = split_text(pdf_text)
60
  index, chunks, embeddings = create_faiss_index(chunks)
61
 
62
- st.write("✅ PDF Loaded Successfully!")
 
 
 
63
 
64
  # User Input
65
  question = st.text_input("Ask a question about AWS Restart program:")
 
2
  import pdfplumber
3
  import faiss
4
  import numpy as np
5
+ import torch
6
  from sentence_transformers import SentenceTransformer
 
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
9
 
10
# Generative QA model: Flan-T5, chosen because it is small enough to run on the
# free Hugging Face Spaces tier.
model_name = "google/flan-t5-base"  # Small model that works in Hugging Face Spaces
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Sentence-embedding model used for semantic retrieval over the PDF chunks.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
17
 
18
# Function to Extract & Clean PDF Text
def extract_clean_text(pdf_path):
    """Read every page of the PDF at `pdf_path` and return its text.

    Pages that yield no text (e.g. scanned images) are skipped; page texts
    are joined with newlines and the result is stripped of edge whitespace.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            content = page.extract_text()
            if content:  # skip pages where extraction produced nothing
                page_texts.append(content)
    # Joining then stripping is equivalent to appending "\n" per page and
    # stripping the trailing one at the end.
    return "\n".join(page_texts).strip()
27
 
28
  # Function to Split Text into Chunks
29
  def split_text(text, chunk_size=500):
 
32
 
33
# Function to Create FAISS Vector Database
def create_faiss_index(chunks):
    """Embed `chunks` and build an exact L2 FAISS index over them.

    Returns a tuple (index, chunks, embeddings), or (None, None, None) when
    `chunks` is empty so callers can detect a failed PDF extraction.
    """
    if not chunks:
        return None, None, None  # Avoid errors if text extraction fails
    # Batch-encode all chunks in ONE call: SentenceTransformer.encode accepts a
    # list and returns a 2D array, which is much faster than encoding each
    # chunk individually in a Python loop (same values, one model pass setup).
    embeddings = np.asarray(embedding_model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings
41
 
42
# Function to Find the Best Matching Chunk
def find_best_chunk(question, index, chunks, embeddings):
    """Return the chunk whose embedding is nearest (L2) to the question's.

    Falls back to an explanatory string when no index was built (empty PDF).
    """
    if index is None:
        return "No valid text found in the PDF."
    # FAISS expects a float32 matrix of shape (n_queries, dim).
    query_vec = embedding_model.encode(question).reshape(1, -1).astype(np.float32)
    _, nearest = index.search(np.array(query_vec), 1)
    best_idx = nearest[0][0]
    return chunks[best_idx]
49
 
50
# Function to Generate a Long, Detailed Answer
def get_answer(question, context):
    """Generate an answer to `question` grounded in `context` with Flan-T5.

    Builds a Question/Context prompt, runs the seq2seq model, and returns the
    decoded answer string.
    """
    input_text = f"Question: {question}\nContext: {context}\nAnswer:"
    # Truncate to the model's input limit so oversized contexts don't error out.
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: disable autograd to save memory and time.
    # NOTE: the original passed temperature=0.7 WITHOUT do_sample=True; with
    # sampling disabled (the default), temperature is ignored and newer
    # transformers versions emit a warning. Dropping it keeps decoding greedy
    # and the output unchanged.
    with torch.no_grad():
        output = model.generate(**inputs, max_length=300)

    return tokenizer.decode(output[0], skip_special_tokens=True)
60
 
61
  # Streamlit UI
 
67
# Chunk the extracted text and build the retrieval index (handles empty text
# by returning None sentinels from create_faiss_index).
chunks = split_text(pdf_text)
index, chunks, embeddings = create_faiss_index(chunks)

# Report load status to the user.
status = (
    "✅ PDF Loaded Successfully!"
    if pdf_text
    else "⚠ No valid text found in the PDF. Please check the document format."
)
st.write(status)

# User Input
question = st.text_input("Ask a question about AWS Restart program:")