stevafernandes committed on
Commit
ebc3422
·
verified ·
1 Parent(s): b1faf2b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -55
app.py CHANGED
@@ -1,13 +1,14 @@
 
 
 
1
  import asyncio
 
 
2
  try:
3
  asyncio.get_running_loop()
4
  except RuntimeError:
5
  asyncio.set_event_loop(asyncio.new_event_loop())
6
 
7
- import streamlit as st
8
- from PyPDF2 import PdfReader
9
- import os
10
-
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
  from langchain_google_genai import GoogleGenerativeAIEmbeddings
13
  from langchain_community.vectorstores import FAISS
@@ -15,79 +16,100 @@ from langchain_google_genai import ChatGoogleGenerativeAI
15
  from langchain.chains.question_answering import load_qa_chain
16
  from langchain.prompts import PromptTemplate
17
 
 
18
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
19
- PDF_PATH = "librarianship.pdf"
20
- INDEX_PATH = "faiss_index" # CHANGED from /tmp/faiss_index to /data/faiss_index
21
 
22
- def get_pdf_text(pdf_path):
23
- text = ""
 
24
  reader = PdfReader(pdf_path)
25
- for page in reader.pages:
26
- page_text = page.extract_text()
27
- if page_text:
28
- text += page_text
29
- return text
30
 
31
- def get_text_chunks(text):
32
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
33
- return text_splitter.split_text(text)
34
 
35
  def build_and_save_vector_store(text_chunks, api_key):
36
- embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
 
 
37
  vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
38
- vector_store.save_local(INDEX_PATH)
39
 
40
- @st.cache_resource(show_spinner=False)
41
  def load_vector_store(api_key):
42
- embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
43
- return FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
 
 
 
 
 
 
 
44
 
45
- @st.cache_resource(show_spinner=False)
46
  def get_conversational_chain(api_key):
47
- prompt_template = """
48
- You are a helpful assistant that only answers based on the context provided from the PDF document.
49
- Do not use any external knowledge or assumptions. If the answer is not found in the context below, reply with "I don't know."
50
- Context:
51
- {context}
52
- Question:
53
- {question}
54
- Answer:
55
- """
56
- model = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0, google_api_key=api_key)
57
- prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
58
- chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
59
- return chain
60
-
61
- def user_input(user_question, api_key):
62
- db = load_vector_store(api_key)
63
- docs = db.similarity_search(user_question)
64
- chain = get_conversational_chain(api_key)
65
- response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
66
- st.write("**Reply:**", response["output_text"])
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
 
68
  def main():
69
- st.set_page_config(page_title="Chat with librarianship")
70
  st.header("Librarianship AI Application (Gemini 2.0)")
71
  st.markdown("---")
72
 
73
- # --- Ensure API key is present ---
74
  if not GOOGLE_API_KEY:
75
- st.error("Please set the GOOGLE_API_KEY environment variable in your Hugging Face Space secrets or .env file.")
 
 
 
76
  st.stop()
77
 
78
- # --- Build FAISS index if not present ---
79
- if not os.path.exists(INDEX_PATH + ".index"):
80
- with st.spinner(f"Indexing {PDF_PATH}..."):
81
- raw_text = get_pdf_text(PDF_PATH)
82
- text_chunks = get_text_chunks(raw_text)
83
  build_and_save_vector_store(text_chunks, GOOGLE_API_KEY)
84
- st.success(f"Indexed {PDF_PATH}. You can now ask questions.")
85
 
86
- # --- Ask questions UI ---
87
  st.subheader("Ask a question about librarianship")
88
- user_question = st.text_input("Ask a question")
89
- if user_question:
90
- user_input(user_question, GOOGLE_API_KEY)
91
 
92
  if __name__ == "__main__":
93
  main()
 
1
import streamlit as st
from PyPDF2 import PdfReader
import os
import asyncio

# ------------ Ensure an asyncio loop (required by Gemini libs) ------------
# Module import runs on Streamlit's script thread, where no loop is running,
# so get_running_loop() raises RuntimeError and we install a fresh loop.
# NOTE(review): presumably the Google generative-AI client needs a current
# event loop on this thread — confirm against the client library's docs.
try:
    asyncio.get_running_loop()
except RuntimeError:
    asyncio.set_event_loop(asyncio.new_event_loop())
11
 
 
 
 
 
12
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
# Restored: this import (original line 15) was dropped in the diff rendering;
# get_conversational_chain() references ChatGoogleGenerativeAI and would
# otherwise raise NameError.
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
18
 
19
# --------------------------- CONFIG ---------------------------------------
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")  # empty string when unset
PDF_PATH = "librarianship.pdf"  # source document, expected next to app.py
# NOTE(review): FAISS.save_local(INDEX_DIR) writes a *directory* at this path
# (index.faiss / index.pkl), not a file named INDEX_DIR + ".index" — the
# existence check in main() appears to never match; confirm and fix there.
INDEX_DIR = "/tmp/faiss_index"
23
 
24
+ # --------------------- Helper functions -----------------------------------
25
def get_pdf_text(pdf_path: str) -> str:
    """Return the concatenated text of every page in the PDF at *pdf_path*."""
    document = PdfReader(pdf_path)
    parts = []
    for page in document.pages:
        # extract_text() can return None (e.g. image-only pages); treat as "".
        parts.append(page.extract_text() or "")
    return "".join(parts)
 
 
 
 
29
 
30
def get_text_chunks(text: str):
    """Split *text* into overlapping chunks sized for embedding requests."""
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=10_000,
        chunk_overlap=1_000,
    )
    return chunker.split_text(text)
33
 
34
def build_and_save_vector_store(text_chunks, api_key):
    """Embed *text_chunks* with Gemini embeddings and persist a FAISS index.

    Writes the index to INDEX_DIR via FAISS.save_local.
    """
    embedder = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=api_key,
    )
    store = FAISS.from_texts(text_chunks, embedding=embedder)
    store.save_local(INDEX_DIR)
40
 
 
41
def load_vector_store(api_key):
    """Load the FAISS index from disk once, then serve it from session state."""
    cached = st.session_state.get("vector_store")
    if cached is None:
        embedder = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001",
            google_api_key=api_key,
        )
        cached = FAISS.load_local(
            INDEX_DIR,
            embedder,
            allow_dangerous_deserialization=True,
        )
        st.session_state.vector_store = cached
    return cached
51
 
 
52
def get_conversational_chain(api_key):
    """Create the QA chain once and reuse it.

    Builds a "stuff"-type load_qa_chain around Gemini 2.0 Flash with a
    context-only prompt, caching the chain in st.session_state so repeated
    questions reuse the same object.
    """
    if "qa_chain" not in st.session_state:
        # NOTE(review): the template's leading whitespace is part of the prompt
        # text sent to the model; kept exactly as written.
        prompt_template = """
        You are a helpful assistant that only answers based on the context provided
        from the PDF document below. If the answer is not in the context, reply with
        "I don't know."

        Context:
        {context}

        Question:
        {question}

        Answer:
        """
        # temperature=0 → deterministic, extractive-style answers.
        model = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash", temperature=0, google_api_key=api_key
        )
        prompt = PromptTemplate(
            template=prompt_template, input_variables=["context", "question"]
        )
        st.session_state.qa_chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
    return st.session_state.qa_chain
76
+
77
def answer_question(question, api_key):
    """Retrieve chunks relevant to *question* and render the model's reply."""
    vector_store = load_vector_store(api_key)
    relevant_docs = vector_store.similarity_search(question)
    qa_chain = get_conversational_chain(api_key)
    output = qa_chain(
        {"input_documents": relevant_docs, "question": question},
        return_only_outputs=True,
    )
    st.write("**Reply:**", output["output_text"])
86
 
87
+ # ----------------------------- MAIN APP ------------------------------------
88
def main():
    """Streamlit entry point: verify config, build the index if missing, chat."""
    st.set_page_config(page_title="Chat librarianship.pdf")
    st.header("Librarianship AI Application (Gemini 2.0)")
    st.markdown("---")

    # --- 0. Check API key ---------------------------------------------------
    if not GOOGLE_API_KEY:
        st.error(
            "Please set the GOOGLE_API_KEY environment variable "
            "in your Hugging Face Space secrets or .env file."
        )
        st.stop()

    # --- 1. Build FAISS index (only if missing) ----------------------------
    # BUG FIX: FAISS.save_local(INDEX_DIR) creates a *directory* at INDEX_DIR
    # (containing index.faiss / index.pkl), so the old check
    # os.path.exists(INDEX_DIR + ".index") never matched and the PDF was
    # re-indexed on every script run. Check for the directory itself.
    if not os.path.isdir(INDEX_DIR):
        with st.spinner(f"Indexing {PDF_PATH}"):
            text_chunks = get_text_chunks(get_pdf_text(PDF_PATH))
            build_and_save_vector_store(text_chunks, GOOGLE_API_KEY)
        st.success("Index built! Ask away 👇")

    # --- 2. Chat UI --------------------------------------------------------
    st.subheader("Ask a question about librarianship")
    q = st.text_input("Type your question here")
    if q:
        answer_question(q, GOOGLE_API_KEY)


if __name__ == "__main__":
    main()