Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,16 +9,11 @@ from langchain_google_genai import ChatGoogleGenerativeAI
|
|
| 9 |
from langchain.chains.question_answering import load_qa_chain
|
| 10 |
from langchain.prompts import PromptTemplate
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
-
from pdf2image import convert_from_bytes
|
| 13 |
-
import pytesseract
|
| 14 |
-
from io import BytesIO
|
| 15 |
|
| 16 |
# Load API key
|
| 17 |
load_dotenv()
|
| 18 |
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
| 19 |
|
| 20 |
-
os.environ["STREAMLIT_CONFIG_DIR"] = "/app/.streamlit"
|
| 21 |
-
|
| 22 |
# Inject CSS for chat bubbles
|
| 23 |
st.markdown("""
|
| 24 |
<style>
|
|
@@ -43,38 +38,27 @@ st.markdown("""
|
|
| 43 |
</style>
|
| 44 |
""", unsafe_allow_html=True)
|
| 45 |
|
| 46 |
-
# Extract text (with OCR fallback)
|
| 47 |
def get_pdf_text(pdf_docs):
|
| 48 |
-
|
| 49 |
for pdf in pdf_docs:
|
| 50 |
-
|
| 51 |
-
pdf_reader = PdfReader(BytesIO(pdf_bytes))
|
| 52 |
-
text = ""
|
| 53 |
-
|
| 54 |
for page in pdf_reader.pages:
|
| 55 |
page_text = page.extract_text()
|
| 56 |
if page_text:
|
| 57 |
text += page_text
|
| 58 |
-
|
| 59 |
-
if not text.strip():
|
| 60 |
-
st.warning(f"OCR applied to '{pdf.name}' (scanned or image-based PDF).")
|
| 61 |
-
images = convert_from_bytes(pdf_bytes)
|
| 62 |
-
for i, img in enumerate(images):
|
| 63 |
-
st.image(img, caption=f"OCR Page {i+1}", use_column_width=True)
|
| 64 |
-
text += pytesseract.image_to_string(img)
|
| 65 |
-
|
| 66 |
-
all_text += text
|
| 67 |
-
return all_text
|
| 68 |
|
| 69 |
def get_text_chunks(text):
|
| 70 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
|
| 71 |
-
|
|
|
|
| 72 |
|
| 73 |
def get_vector_store(text_chunks):
|
| 74 |
if not text_chunks:
|
| 75 |
raise ValueError("No text chunks to embed. Check if your PDF contains extractable text.")
|
| 76 |
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
|
| 77 |
-
|
|
|
|
| 78 |
|
| 79 |
def get_conversational_chain():
|
| 80 |
prompt_template = """
|
|
@@ -87,18 +71,20 @@ def get_conversational_chain():
|
|
| 87 |
"""
|
| 88 |
model = ChatGoogleGenerativeAI(model="models/gemini-2.0-flash", temperature=0.3)
|
| 89 |
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
|
| 90 |
-
|
|
|
|
| 91 |
|
| 92 |
def display_chat(user_msg, bot_msg):
|
| 93 |
st.markdown(f"<div class='chat-bubble user'>{user_msg}</div>", unsafe_allow_html=True)
|
| 94 |
st.markdown(f"<div class='chat-bubble bot'>{bot_msg}</div>", unsafe_allow_html=True)
|
| 95 |
|
| 96 |
def main():
|
| 97 |
-
st.set_page_config(page_title="Chat with PDFs
|
| 98 |
-
st.title("π Chat with Your PDFs using Gemini
|
| 99 |
|
| 100 |
col1, col2 = st.columns([1, 2], gap="large")
|
| 101 |
|
|
|
|
| 102 |
with col1:
|
| 103 |
st.header("π Upload & Process")
|
| 104 |
pdf_docs = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
|
|
@@ -111,7 +97,7 @@ def main():
|
|
| 111 |
raw_text = get_pdf_text(pdf_docs)
|
| 112 |
|
| 113 |
if not raw_text.strip():
|
| 114 |
-
st.error("β No text
|
| 115 |
return
|
| 116 |
|
| 117 |
text_chunks = get_text_chunks(raw_text)
|
|
@@ -129,6 +115,7 @@ def main():
|
|
| 129 |
except Exception as e:
|
| 130 |
st.error(f"β Error creating vector store: {str(e)}")
|
| 131 |
|
|
|
|
| 132 |
with col2:
|
| 133 |
st.header("π¬ Ask Questions")
|
| 134 |
user_question = st.text_input("Type your question here...")
|
|
|
|
| 9 |
from langchain.chains.question_answering import load_qa_chain
|
| 10 |
from langchain.prompts import PromptTemplate
|
| 11 |
from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
# Load API key
|
| 14 |
load_dotenv()
|
| 15 |
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
| 16 |
|
|
|
|
|
|
|
| 17 |
# Inject CSS for chat bubbles
|
| 18 |
st.markdown("""
|
| 19 |
<style>
|
|
|
|
| 38 |
</style>
|
| 39 |
""", unsafe_allow_html=True)
|
| 40 |
|
|
|
|
| 41 |
def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page of the uploaded PDFs.

    Args:
        pdf_docs: Iterable of file-like objects (e.g. Streamlit UploadedFile)
            that PdfReader can open.

    Returns:
        str: All extractable page text concatenated in upload/page order;
        the empty string when nothing was extractable (e.g. scanned,
        image-only PDFs) or when pdf_docs is empty.
    """
    pages = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            # extract_text() may return None/"" for image-only pages; skip those.
            if page_text:
                pages.append(page_text)
    # Join once at the end instead of repeated `text += ...`, which is
    # quadratic in total text size; output bytes are identical.
    return "".join(pages)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Args:
        text: The full extracted text of one or more PDFs.

    Returns:
        list[str]: Chunks of at most 10000 characters with a 1000-character
        overlap between consecutive chunks.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return splitter.split_text(text)
|
| 55 |
|
| 56 |
def get_vector_store(text_chunks):
    """Build an in-memory FAISS vector store over the given text chunks.

    Args:
        text_chunks: Non-empty list of text chunks to embed.

    Returns:
        A FAISS vector store backed by Google Generative AI embeddings.

    Raises:
        ValueError: If text_chunks is empty (nothing to embed).
    """
    # Guard first: embedding an empty list would fail later with a less
    # actionable error than this message.
    if not text_chunks:
        raise ValueError("No text chunks to embed. Check if your PDF contains extractable text.")
    embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    return FAISS.from_texts(text_chunks, embedding=embedder)
|
| 62 |
|
| 63 |
def get_conversational_chain():
|
| 64 |
prompt_template = """
|
|
|
|
| 71 |
"""
|
| 72 |
model = ChatGoogleGenerativeAI(model="models/gemini-2.0-flash", temperature=0.3)
|
| 73 |
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
|
| 74 |
+
chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
|
| 75 |
+
return chain
|
| 76 |
|
| 77 |
def display_chat(user_msg, bot_msg):
    """Render one question/answer exchange as styled chat bubbles.

    Args:
        user_msg: The user's question (rendered with the 'user' bubble CSS).
        bot_msg: The model's answer (rendered with the 'bot' bubble CSS).
    """
    # Render both bubbles with the same markup, varying only the CSS class.
    for role, msg in (("user", user_msg), ("bot", bot_msg)):
        st.markdown(f"<div class='chat-bubble {role}'>{msg}</div>", unsafe_allow_html=True)
|
| 80 |
|
| 81 |
def main():
|
| 82 |
+
st.set_page_config(page_title="Chat with PDFs", layout="wide")
|
| 83 |
+
st.title("π Chat with Your PDFs using Gemini")
|
| 84 |
|
| 85 |
col1, col2 = st.columns([1, 2], gap="large")
|
| 86 |
|
| 87 |
+
# LEFT: Upload PDFs
|
| 88 |
with col1:
|
| 89 |
st.header("π Upload & Process")
|
| 90 |
pdf_docs = st.file_uploader("Upload PDF files", type=["pdf"], accept_multiple_files=True)
|
|
|
|
| 97 |
raw_text = get_pdf_text(pdf_docs)
|
| 98 |
|
| 99 |
if not raw_text.strip():
|
| 100 |
+
st.error("β No extractable text found in the uploaded PDFs. They might be scanned images.")
|
| 101 |
return
|
| 102 |
|
| 103 |
text_chunks = get_text_chunks(raw_text)
|
|
|
|
| 115 |
except Exception as e:
|
| 116 |
st.error(f"β Error creating vector store: {str(e)}")
|
| 117 |
|
| 118 |
+
# RIGHT: Ask Questions
|
| 119 |
with col2:
|
| 120 |
st.header("π¬ Ask Questions")
|
| 121 |
user_question = st.text_input("Type your question here...")
|