sreebhargav committed on
Commit
e747eba
·
verified ·
1 Parent(s): abc7493

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -27
app.py CHANGED
@@ -9,16 +9,11 @@ from langchain_google_genai import ChatGoogleGenerativeAI
9
  from langchain.chains.question_answering import load_qa_chain
10
  from langchain.prompts import PromptTemplate
11
  from dotenv import load_dotenv
12
- from pdf2image import convert_from_bytes
13
- import pytesseract
14
- from io import BytesIO
15
 
16
  # Load API key
17
  load_dotenv()
18
  genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
19
 
20
- os.environ["STREAMLIT_CONFIG_DIR"] = "/app/.streamlit"
21
-
22
  # Inject CSS for chat bubbles
23
  st.markdown("""
24
  <style>
@@ -43,38 +38,27 @@ st.markdown("""
43
  </style>
44
  """, unsafe_allow_html=True)
45
 
46
- # Extract text (with OCR fallback)
47
  def get_pdf_text(pdf_docs):
48
- all_text = ""
49
  for pdf in pdf_docs:
50
- pdf_bytes = pdf.read()
51
- pdf_reader = PdfReader(BytesIO(pdf_bytes))
52
- text = ""
53
-
54
  for page in pdf_reader.pages:
55
  page_text = page.extract_text()
56
  if page_text:
57
  text += page_text
58
-
59
- if not text.strip():
60
- st.warning(f"OCR applied to '{pdf.name}' (scanned or image-based PDF).")
61
- images = convert_from_bytes(pdf_bytes)
62
- for i, img in enumerate(images):
63
- st.image(img, caption=f"OCR Page {i+1}", use_column_width=True)
64
- text += pytesseract.image_to_string(img)
65
-
66
- all_text += text
67
- return all_text
68
 
69
  def get_text_chunks(text):
70
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
71
- return text_splitter.split_text(text)
 
72
 
73
  def get_vector_store(text_chunks):
74
  if not text_chunks:
75
  raise ValueError("No text chunks to embed. Check if your PDF contains extractable text.")
76
  embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
77
- return FAISS.from_texts(text_chunks, embedding=embeddings)
 
78
 
79
  def get_conversational_chain():
80
  prompt_template = """
@@ -87,18 +71,20 @@ def get_conversational_chain():
87
  """
88
  model = ChatGoogleGenerativeAI(model="models/gemini-2.0-flash", temperature=0.3)
89
  prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
90
- return load_qa_chain(model, chain_type="stuff", prompt=prompt)
 
91
 
92
  def display_chat(user_msg, bot_msg):
93
  st.markdown(f"<div class='chat-bubble user'>{user_msg}</div>", unsafe_allow_html=True)
94
  st.markdown(f"<div class='chat-bubble bot'>{bot_msg}</div>", unsafe_allow_html=True)
95
 
96
  def main():
97
- st.set_page_config(page_title="Chat with PDFs (Text + Scanned)", layout="wide")
98
- st.title("πŸ“š Chat with Your PDFs using Gemini + OCR")
99
 
100
  col1, col2 = st.columns([1, 2], gap="large")
101
 
 
102
  with col1:
103
  st.header("πŸ“ Upload & Process")
104
  pdf_docs = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
@@ -111,7 +97,7 @@ def main():
111
  raw_text = get_pdf_text(pdf_docs)
112
 
113
  if not raw_text.strip():
114
- st.error("❗ No text could be extracted, even with OCR.")
115
  return
116
 
117
  text_chunks = get_text_chunks(raw_text)
@@ -129,6 +115,7 @@ def main():
129
  except Exception as e:
130
  st.error(f"❗ Error creating vector store: {str(e)}")
131
 
 
132
  with col2:
133
  st.header("πŸ’¬ Ask Questions")
134
  user_question = st.text_input("Type your question here...")
 
9
  from langchain.chains.question_answering import load_qa_chain
10
  from langchain.prompts import PromptTemplate
11
  from dotenv import load_dotenv
 
 
 
12
 
13
  # Load API key
14
  load_dotenv()
15
  genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
16
 
 
 
17
  # Inject CSS for chat bubbles
18
  st.markdown("""
19
  <style>
 
38
  </style>
39
  """, unsafe_allow_html=True)
40
 
 
41
def get_pdf_text(pdf_docs):
    """Concatenate the extractable text of every page of every uploaded PDF.

    Args:
        pdf_docs: iterable of uploaded PDF file objects (as produced by
            st.file_uploader), each readable by PdfReader.

    Returns:
        A single string with all page texts appended in order; pages with
        no extractable text are skipped.
    """
    collected = []
    for uploaded in pdf_docs:
        reader = PdfReader(uploaded)
        for page in reader.pages:
            extracted = page.extract_text()
            # extract_text() may return None or "" for image-only pages.
            if extracted:
                collected.append(extracted)
    return "".join(collected)
 
 
 
 
 
 
 
 
 
50
 
51
def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Uses a recursive character splitter with a 10,000-character window and
    1,000-character overlap so context is preserved across chunk edges.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return splitter.split_text(text)
55
 
56
def get_vector_store(text_chunks):
    """Embed the given text chunks and build an in-memory FAISS index.

    Args:
        text_chunks: non-empty list of strings to embed.

    Returns:
        A FAISS vector store built from the chunks.

    Raises:
        ValueError: if *text_chunks* is empty (nothing to embed).
    """
    if not text_chunks:
        raise ValueError("No text chunks to embed. Check if your PDF contains extractable text.")
    embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    return FAISS.from_texts(text_chunks, embedding=embedder)
62
 
63
  def get_conversational_chain():
64
  prompt_template = """
 
71
  """
72
  model = ChatGoogleGenerativeAI(model="models/gemini-2.0-flash", temperature=0.3)
73
  prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
74
+ chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
75
+ return chain
76
 
77
def display_chat(user_msg, bot_msg):
    """Render one user/bot exchange as styled chat bubbles.

    Both messages are HTML-escaped before interpolation because they flow
    into st.markdown(..., unsafe_allow_html=True); without escaping, any
    HTML/JS typed by the user (or echoed by the model) would be executed
    by the browser (markup/script injection).
    """
    import html  # stdlib; local import keeps this fix self-contained

    safe_user = html.escape(user_msg)
    safe_bot = html.escape(bot_msg)
    st.markdown(f"<div class='chat-bubble user'>{safe_user}</div>", unsafe_allow_html=True)
    st.markdown(f"<div class='chat-bubble bot'>{safe_bot}</div>", unsafe_allow_html=True)
80
 
81
  def main():
82
+ st.set_page_config(page_title="Chat with PDFs", layout="wide")
83
+ st.title("πŸ“š Chat with Your PDFs using Gemini")
84
 
85
  col1, col2 = st.columns([1, 2], gap="large")
86
 
87
+ # LEFT: Upload PDFs
88
  with col1:
89
  st.header("πŸ“ Upload & Process")
90
  pdf_docs = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
 
97
  raw_text = get_pdf_text(pdf_docs)
98
 
99
  if not raw_text.strip():
100
+ st.error("❗ No extractable text found in the uploaded PDFs. They might be scanned images.")
101
  return
102
 
103
  text_chunks = get_text_chunks(raw_text)
 
115
  except Exception as e:
116
  st.error(f"❗ Error creating vector store: {str(e)}")
117
 
118
+ # RIGHT: Ask Questions
119
  with col2:
120
  st.header("πŸ’¬ Ask Questions")
121
  user_question = st.text_input("Type your question here...")