theerasin committed on
Commit
3d4a265
·
verified ·
1 Parent(s): 5aefeb3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -145
app.py CHANGED
@@ -1,166 +1,110 @@
1
  import streamlit as st
2
- from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
3
- from pydantic import BaseModel, Field
4
- from typing import List
5
- from datetime import datetime
6
- import PyPDF2
7
- from fpdf import FPDF
8
- from docx import Document
9
- import io
10
- from langchain_text_splitters import RecursiveCharacterTextSplitter
11
  from langchain_community.vectorstores import FAISS
 
 
12
  from langchain_core.documents import Document as LCDocument
13
- import time
 
 
 
 
 
14
 
15
# === Summarization model ===
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# === QA model ===
qa_tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
qa_model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)

# === Embedding model ===
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
# BUG FIX: HuggingFaceEmbeddings expects the *name* of a model via its
# `model_name` parameter, not an instantiated SentenceTransformer object;
# passing `model=embedding_model` fails pydantic validation at runtime.
embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
28
 
29
# === Data models ===
class KeyPoint(BaseModel):
    """A single noteworthy point pulled out of the document text."""

    point: str = Field(description="A key point extracted from the document.")


class Summary(BaseModel):
    """Wrapper around the generated summary string."""

    summary: str = Field(description="A brief summary of the document content.")


class DocumentAnalysis(BaseModel):
    """Combined result of an analysis run: key points plus overall summary."""

    key_points: List[KeyPoint]
    summary: Summary
 
39
 
40
def extract_text_from_pdf(pdf_file):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        pdf_file: A file-like object (or path) accepted by ``PyPDF2.PdfReader``.

    Returns:
        str: The concatenated text of all pages. Pages for which PyPDF2
        cannot extract text (e.g. scanned images) contribute ``""``.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # BUG FIX: extract_text() may return None for image-only pages, which
    # would make "".join(...) raise TypeError; coerce None to "".
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
def analyze_text_structured(text):
    """Summarize *text* chunk by chunk and derive key points.

    The text is split into overlapping windows, each window is summarized
    independently, and the per-window summaries are joined into one full
    summary whose sentences become the key points.

    Args:
        text: Raw document text.

    Returns:
        DocumentAnalysis: Structured summary plus extracted key points.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)

    partial_summaries = []
    for piece in chunks:
        # Best-effort: a chunk the model rejects contributes nothing rather
        # than aborting the whole analysis.
        try:
            output = summarizer(piece, max_length=200, min_length=50, do_sample=False)
            partial_summaries.append(output[0]["summary_text"])
        except Exception:
            partial_summaries.append("")

    full_summary = " ".join(partial_summaries)
    key_points = [
        KeyPoint(point=sentence.strip())
        for sentence in full_summary.split(". ")
        if sentence.strip()
    ]
    return DocumentAnalysis(summary=Summary(summary=full_summary), key_points=key_points)
59
-
60
def json_to_text(analysis):
    """Render an analysis result as a plain-text report.

    Args:
        analysis: Object exposing ``summary.summary`` (str) and
            ``key_points``, an iterable of objects with a ``point`` attribute.

    Returns:
        str: A report with a summary section followed by numbered key points.
    """
    lines = [f"=== Summary ===\n{analysis.summary.summary}\n", "=== Key Points ==="]
    lines.extend(
        f"{idx}. {kp.point}" for idx, kp in enumerate(analysis.key_points, start=1)
    )
    return "\n".join(lines) + "\n"
66
-
67
def create_pdf_report(analysis):
    """Build a PDF report for *analysis* and return it as raw bytes."""
    report = FPDF()
    report.add_page()
    report.set_font('Helvetica', '', 12)
    # Centered title and generation timestamp header.
    report.cell(200, 10, txt="PDF Analysis Report", ln=True, align='C')
    report.cell(200, 10, txt=f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=True, align='C')
    report.multi_cell(0, 10, txt=json_to_text(analysis))
    # NOTE(review): passing a BytesIO together with dest='S' mixes two fpdf
    # output modes — verify against the installed fpdf version.
    buffer = io.BytesIO()
    report.output(buffer, dest='S')
    buffer.seek(0)
    return buffer.getvalue()
78
-
79
def create_word_report(analysis):
    """Build a .docx report for *analysis* and return it as raw bytes."""
    report = Document()
    report.add_heading('PDF Analysis Report', 0)
    report.add_paragraph(f'Generated on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    report.add_heading('Analysis', level=1)
    report.add_paragraph(json_to_text(analysis))
    buffer = io.BytesIO()
    report.save(buffer)
    buffer.seek(0)
    return buffer.getvalue()
89
 
90
# === Streamlit UI ===
# NOTE(review): the nesting below is reconstructed from a diff rendering
# that lost indentation — confirm against the original file.
st.set_page_config(page_title="Chat With PDF (BART + BGE)", page_icon="📄")
st.title("📄 Chat With PDF")
st.caption("Summarize and Chat with Documents using facebook/bart-large-cnn + BGE-small Embeddings + RoBERTa QA")

# Initialize every session key once; "messages" is the chat log, so it
# starts as an empty list rather than None.
for state_key in ["current_file", "pdf_summary", "analysis_time", "pdf_report", "word_report", "vectorstore", "messages"]:
    if state_key not in st.session_state:
        st.session_state[state_key] = [] if state_key == "messages" else None

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    # A different upload invalidates every cached analysis artifact.
    if st.session_state.current_file != uploaded_file.name:
        st.session_state.current_file = uploaded_file.name
        for state_key in ["pdf_summary", "pdf_report", "word_report", "vectorstore", "messages"]:
            st.session_state[state_key] = [] if state_key == "messages" else None

    text = extract_text_from_pdf(uploaded_file)

    if st.button("Analyze Text"):
        start_time = time.time()
        with st.spinner("Analyzing..."):
            analysis = analyze_text_structured(text)
            st.session_state.pdf_summary = analysis

            splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            chunks = splitter.split_text(text)
            docs = [LCDocument(page_content=chunk) for chunk in chunks]
            st.session_state.vectorstore = FAISS.from_documents(docs, embedding_function)

            st.session_state.pdf_report = create_pdf_report(analysis)
            st.session_state.word_report = create_word_report(analysis)

        st.session_state.analysis_time = time.time() - start_time
        st.subheader("Analysis Results")
        st.text(json_to_text(analysis))

        col1, col2 = st.columns(2)
        with col1:
            st.download_button(
                label="Download PDF Report",
                data=st.session_state.pdf_report,
                file_name="analysis_report.pdf",
                mime="application/pdf"
            )
        with col2:
            st.download_button(
                label="Download Word Report",
                data=st.session_state.word_report,
                file_name="analysis_report.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            )

    if st.session_state.vectorstore is not None:
        st.subheader("Chat with the Document")

        # Replay the running conversation on every rerun.
        for msg in st.session_state.messages:
            with st.chat_message(msg["role"]):
                st.markdown(msg["content"])

        if prompt := st.chat_input("Ask a question about the document"):
            st.session_state.messages.append({"role": "user", "content": prompt})
            with st.chat_message("user"):
                st.markdown(prompt)
            with st.chat_message("assistant"):
                with st.spinner("Searching..."):
                    docs = st.session_state.vectorstore.similarity_search(prompt, k=3)
                    context = "\n".join(doc.page_content for doc in docs)
                    answer = qa_pipeline({"question": prompt, "context": context})["answer"]
                    st.markdown(answer)
            st.session_state.messages.append({"role": "assistant", "content": answer})

if st.session_state.analysis_time is not None:
    st.markdown(
        f'<div style="text-align:center; margin-top:2rem; color:gray;">Analysis Time: {st.session_state.analysis_time:.1f}s | Embedding: BGE-small v1.5 | QA: RoBERTa-SQuAD2</div>',
        unsafe_allow_html=True
    )
 
1
  import streamlit as st
2
+ from transformers import pipeline
 
 
 
 
 
 
 
 
3
  from langchain_community.vectorstores import FAISS
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
  from langchain_core.documents import Document as LCDocument
7
+ import PyPDF2
8
+ from docx import Document as DocxDocument
9
+ import io
10
+ from typing import List
11
+ from pydantic import BaseModel
12
+ import tempfile
13
 
14
+
15
# === Model loading ===
# Streamlit reruns the whole script on every widget interaction; without
# caching, all three models would be reloaded each time. st.cache_resource
# memoizes them for the lifetime of the server process. The module-level
# names are unchanged so the rest of the file works as before.

@st.cache_resource
def _load_summarizer():
    """Load the BART summarization pipeline (cached across reruns)."""
    return pipeline("summarization", model="facebook/bart-large-cnn")


@st.cache_resource
def _load_qa_pipeline():
    """Load the extractive QA pipeline (cached across reruns)."""
    return pipeline("question-answering", model="deepset/roberta-base-squad2")


@st.cache_resource
def _load_embeddings():
    """Load the sentence-embedding model used for FAISS retrieval."""
    return HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")


summarizer = _load_summarizer()
qa_pipeline = _load_qa_pipeline()
embedding_function = _load_embeddings()
 
 
 
23
 
 
 
 
24
 
25
# === Pydantic Models ===
class Summary(BaseModel):
    """Holds the generated summary text."""

    summary: str


class KeyPoint(BaseModel):
    """One noteworthy sentence extracted from the summary."""

    point: str


class DocumentAnalysis(BaseModel):
    """Full analysis result: the summary plus its derived key points."""

    summary: Summary
    key_points: List[KeyPoint]
35
 
 
 
 
36
 
37
# === Loaders ===
def load_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: A binary file-like object readable by ``PyPDF2.PdfReader``.

    Returns:
        str: Concatenated page text. Pages with no extractable text
        (e.g. scanned images) contribute ``""``.
    """
    reader = PyPDF2.PdfReader(file)
    # BUG FIX: extract_text() can return None for image-only pages, which
    # crashed the old `text += page.extract_text()` loop with TypeError;
    # `or ""` guards that, and join() also avoids quadratic string +=.
    return "".join(page.extract_text() or "" for page in reader.pages)
44
+
45
def load_docx(file):
    """Return the text of a .docx file, one paragraph per line."""
    document = DocxDocument(file)
    paragraphs = (para.text for para in document.paragraphs)
    return "\n".join(paragraphs)
48
+
49
+
50
# === Analysis ===
def analyze_text_structured(text):
    """Summarize *text* and derive key points from the summary.

    Long inputs are split into chunks before summarization because
    facebook/bart-large-cnn accepts roughly 1024 input tokens; feeding the
    raw text of a multi-page document in one call truncates it or fails.
    For short texts (a single chunk) the behavior is identical to a single
    summarizer call.

    Args:
        text: Raw document text.

    Returns:
        DocumentAnalysis: summary plus key points (the summary's sentences).
    """
    # ~2000 chars stays safely under the model's token limit.
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    parts = []
    for chunk in splitter.split_text(text):
        out = summarizer(chunk, max_length=200, min_length=50, do_sample=False)
        parts.append(out[0]["summary_text"])
    result = " ".join(parts)
    key_points = [KeyPoint(point=line.strip()) for line in result.split(". ") if line.strip()]
    return DocumentAnalysis(summary=Summary(summary=result), key_points=key_points)
55
+
56
# === Embedding & Retrieval ===
def get_vectorstore_from_text(text):
    """Chunk *text* and index the chunks in an in-memory FAISS store."""
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    pieces = chunker.split_text(text)
    documents = [LCDocument(page_content=piece) for piece in pieces]
    return FAISS.from_documents(documents, embedding_function)
62
+
63
def answer_question(vectorstore, question):
    """Answer *question* from the three most relevant indexed chunks.

    Args:
        vectorstore: FAISS store built by ``get_vectorstore_from_text``.
        question: The user's natural-language question.

    Returns:
        str: The extractive answer span chosen by the QA model.
    """
    # similarity_search replaces retriever.get_relevant_documents, which is
    # deprecated in recent LangChain releases; k=3 behavior is the same.
    docs = vectorstore.similarity_search(question, k=3)
    context = "\n".join(doc.page_content for doc in docs)
    result = qa_pipeline(question=question, context=context)
    return result["answer"]
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
# === Streamlit UI ===
st.title("📄 AI Document Analyzer")

uploaded_file = st.file_uploader("Upload a document (PDF or DOCX)", type=["pdf", "docx"])
input_text = st.text_area("Or paste your text here", height=200)

if st.button("Analyze"):
    if uploaded_file:
        file_bytes = uploaded_file.read()
        # lower() so "Report.PDF" is handled like "report.pdf".
        file_ext = uploaded_file.name.split(".")[-1].lower()
        if file_ext == "pdf":
            text = load_pdf(io.BytesIO(file_bytes))
        elif file_ext == "docx":
            text = load_docx(io.BytesIO(file_bytes))
        else:
            st.error("Unsupported file format.")
            st.stop()
    elif input_text:
        text = input_text
    else:
        st.warning("Please upload a file or paste text.")
        st.stop()

    with st.spinner("Analyzing..."):
        # BUG FIX: results must survive Streamlit's rerun on the next widget
        # interaction. Previously they were locals inside this button branch,
        # so typing a question re-ran the script with the button False and
        # the QA section (and its vectorstore) disappeared — QA never worked.
        st.session_state.analysis = analyze_text_structured(text)
        st.session_state.vectorstore = get_vectorstore_from_text(text)

# Render results (and the QA box) whenever an analysis exists, independent
# of the Analyze button's transient state.
if st.session_state.get("analysis") is not None:
    analysis = st.session_state.analysis

    st.subheader("🔍 Summary")
    st.write(analysis.summary.summary)

    st.subheader("📌 Key Points")
    for point in analysis.key_points:
        st.markdown(f"- {point.point}")

    st.subheader(" Ask a Question")
    user_question = st.text_input("What do you want to know?")
    if user_question:
        with st.spinner("Searching for an answer..."):
            answer = answer_question(st.session_state.vectorstore, user_question)
        st.success(f"💬 Answer: {answer}")