NHZ committed on
Commit
7eecbbb
·
verified ·
1 Parent(s): b8b3983

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -36
app.py CHANGED
@@ -1,11 +1,8 @@
1
  import os
2
  import requests
3
- import numpy as np
4
- import faiss
5
- from PyPDF2 import PdfReader
6
- from sentence_transformers import SentenceTransformer
7
- from transformers import AutoTokenizer, AutoModel
8
  import torch
 
 
9
  from langchain.vectorstores import FAISS
10
  from langchain.chains import RetrievalQA
11
  from langchain.prompts import PromptTemplate
@@ -40,7 +37,7 @@ class GroqLLM(LLM):
40
  data = response.json()
41
  return data["choices"][0]["message"]["content"]
42
 
43
- # Initialize Groq API LLM with explicit API key
44
  llm = GroqLLM(api_key="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")
45
 
46
  # Function to extract content from a public Google Drive PDF link
@@ -60,12 +57,10 @@ def extract_pdf_content(drive_url):
60
  text += page.extract_text()
61
  return text
62
 
63
- # Function to create a FAISS vector store from the document content
64
  def create_vector_store(text):
65
- # Split the text into sentences and clean it
66
  sentences = [sentence.strip() for sentence in text.split(". ") if sentence.strip()]
67
 
68
- # Use Hugging Face transformer model for embeddings
69
  model_name = "sentence-transformers/all-MiniLM-L6-v2"
70
  tokenizer = AutoTokenizer.from_pretrained(model_name)
71
  model = AutoModel.from_pretrained(model_name)
@@ -76,37 +71,28 @@ def create_vector_store(text):
76
  embeddings = model(**tokens).last_hidden_state.mean(dim=1).squeeze().numpy()
77
  return embeddings
78
 
79
- # Create a FAISS vector store with sentences and their embeddings
80
- vector_store = FAISS.from_texts(
81
- texts=sentences,
82
- embedding_function=embed
83
- )
84
  return vector_store, sentences
85
 
86
  # Streamlit app
87
  st.title("RAG-based Application with Focused Context")
88
 
89
- # Predefined Google Drive link
90
  drive_url = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
91
-
92
- # Extract document content
93
- st.write("Extracting content from the document...")
94
  text = extract_pdf_content(drive_url)
 
95
  if text:
96
  st.write("Document extracted successfully!")
97
-
98
- st.write("Creating vector store...")
99
  vector_store, sentences = create_vector_store(text)
100
-
101
- st.write("Vector store created successfully!")
102
 
103
  query = st.text_input("Enter your query:")
104
  if query:
105
- st.write("Retrieving relevant context from the document...")
106
  retriever = vector_store.as_retriever()
107
- retriever.search_kwargs["k"] = 3 # Retrieve top 3 matches
108
 
109
- # Define a prompt template to guide LLM response generation
110
  prompt_template = PromptTemplate(
111
  template="""
112
  Use the following context to answer the question:
@@ -118,25 +104,16 @@ if text:
118
  input_variables=["context", "question"]
119
  )
120
 
121
- # Create a RetrievalQA chain
122
  qa_chain = RetrievalQA.from_chain_type(
123
  retriever=retriever,
124
  llm=llm,
125
- chain_type="stuff", # Use the default chain type
126
- return_source_documents=True # Optional
127
  )
128
 
129
- # Run the query through the QA chain and get the outputs
130
  response = qa_chain({"query": query})
131
  answer = response["result"]
132
 
133
- # Display the result
134
  st.write("Answer:", answer)
135
-
136
- # Optionally display the source documents
137
- if "source_documents" in response:
138
- st.write("Source Documents:")
139
- for doc in response["source_documents"]:
140
- st.write(doc.page_content)
141
  else:
142
  st.error("Failed to extract content from the document.")
 
1
  import os
2
  import requests
 
 
 
 
 
3
  import torch
4
+ from transformers import AutoTokenizer, AutoModel
5
+ from PyPDF2 import PdfReader
6
  from langchain.vectorstores import FAISS
7
  from langchain.chains import RetrievalQA
8
  from langchain.prompts import PromptTemplate
 
37
  data = response.json()
38
  return data["choices"][0]["message"]["content"]
39
 
40
+ # Initialize Groq API LLM
41
  llm = GroqLLM(api_key="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")
42
 
43
  # Function to extract content from a public Google Drive PDF link
 
57
  text += page.extract_text()
58
  return text
59
 
60
+ # Function to create a FAISS vector store
61
  def create_vector_store(text):
 
62
  sentences = [sentence.strip() for sentence in text.split(". ") if sentence.strip()]
63
 
 
64
  model_name = "sentence-transformers/all-MiniLM-L6-v2"
65
  tokenizer = AutoTokenizer.from_pretrained(model_name)
66
  model = AutoModel.from_pretrained(model_name)
 
71
  embeddings = model(**tokens).last_hidden_state.mean(dim=1).squeeze().numpy()
72
  return embeddings
73
 
74
+ embeddings = [embed(sentence) for sentence in sentences]
75
+ text_embeddings = [(sentences[i], embeddings[i]) for i in range(len(sentences))]
76
+ vector_store = FAISS.from_embeddings(text_embeddings)
77
+
 
78
  return vector_store, sentences
79
 
80
  # Streamlit app
81
  st.title("RAG-based Application with Focused Context")
82
 
 
83
  drive_url = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
 
 
 
84
  text = extract_pdf_content(drive_url)
85
+
86
  if text:
87
  st.write("Document extracted successfully!")
 
 
88
  vector_store, sentences = create_vector_store(text)
89
+ st.write("Vector store created!")
 
90
 
91
  query = st.text_input("Enter your query:")
92
  if query:
 
93
  retriever = vector_store.as_retriever()
94
+ retriever.search_kwargs["k"] = 3
95
 
 
96
  prompt_template = PromptTemplate(
97
  template="""
98
  Use the following context to answer the question:
 
104
  input_variables=["context", "question"]
105
  )
106
 
 
107
  qa_chain = RetrievalQA.from_chain_type(
108
  retriever=retriever,
109
  llm=llm,
110
+ chain_type="stuff",
111
+ return_source_documents=True
112
  )
113
 
 
114
  response = qa_chain({"query": query})
115
  answer = response["result"]
116
 
 
117
  st.write("Answer:", answer)
 
 
 
 
 
 
118
  else:
119
  st.error("Failed to extract content from the document.")