Spaces:

waqasbm
/

Data_Extractor_Tool

Sleeping

App Files Community

waqasbm committed on May 19, 2025

Commit

27d2624

verified ·

1 Parent(s): 8b7511d

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -79

app.py CHANGED Viewed

@@ -1,30 +1,34 @@
 import streamlit as st
 import fitz  # PyMuPDF
-import requests
 import os
 from dotenv import load_dotenv
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from keybert import KeyBERT
 from textblob import TextBlob
-# Setup
 load_dotenv()
-GROQ_API_KEY = os.getenv("wbm1")
-GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
 GROQ_MODEL = "llama3-8b-8192"
-st.set_page_config(page_title="🧠 Smart PDF Extractor", layout="centered")
-st.title("📄 Smart PDF Extractor & AI Summarizer")
 st.markdown("""
-Extract summaries, insights, keywords, and sentiment from your PDFs using AI.
 """)
-uploaded_file = st.file_uploader("📁 Upload your PDF file", type=["pdf"])
-# ---------- Utilities ----------
 def extract_text_from_pdf(file):
     doc = fitz.open(stream=file.read(), filetype="pdf")
     text = ""
@@ -32,38 +36,42 @@ def extract_text_from_pdf(file):
         text += page.get_text()
     return text
-def split_text_langchain(text, chunk_size=3000, chunk_overlap=200):
-    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-    chunks = splitter.split_text(text)
-    return chunks
-def summarize_chunk(text, prompt):
-    headers = {
-        "Authorization": f"Bearer {GROQ_API_KEY}",
-        "Content-Type": "application/json"
-    }
-    payload = {
-        "model": GROQ_MODEL,
-        "messages": [
-            {"role": "system", "content": prompt},
-            {"role": "user", "content": text}
-        ],
-        "temperature": 0.3,
-        "max_tokens": 1024
-    }
-    response = requests.post(GROQ_API_URL, headers=headers, json=payload)
-    response.raise_for_status()
-    return response.json()["choices"][0]["message"]["content"]
 def extract_keywords(text, top_n=10):
     kw_model = KeyBERT()
     keywords = kw_model.extract_keywords(text, top_n=top_n, stop_words='english')
     return [kw[0] for kw in keywords]
 def get_sentiment(text):
     blob = TextBlob(text)
     polarity = blob.sentiment.polarity
@@ -74,48 +82,47 @@ def get_sentiment(text):
     else:
         return "😐 Neutral"
 def make_download_button(text, filename="summary.txt"):
     st.download_button("💾 Download Summary", data=text, file_name=filename, mime="text/plain")
-# ---------- Main Logic ----------
-if uploaded_file:
-    with st.spinner("🧠 Reading and analyzing PDF..."):
-        pdf_text = extract_text_from_pdf(uploaded_file)
-        chunks = split_text_langchain(pdf_text)
-        prompt = (
-            "Summarize the following text clearly. Focus on main ideas, insights, data points, and useful information."
-        )
-        summaries = []
-        for i, chunk in enumerate(chunks):
-            st.write(f"⏳ Summarizing part {i + 1}/{len(chunks)}...")
-            try:
-                summary = summarize_chunk(chunk, prompt)
-                summaries.append(summary)
-            except Exception as e:
-                st.error(f"Error summarizing chunk {i + 1}: {e}")
-                break
-        if summaries:
-            final_summary = "\n\n".join(summaries)
-            st.subheader("✅ Final Summary")
-            st.success(final_summary)
-            make_download_button(final_summary)
-            st.markdown("---")
-            st.subheader("🔑 Keywords")
-            keywords = extract_keywords(final_summary)
-            st.write(", ".join(keywords))
-            st.subheader("📊 Sentiment")
-            sentiment = get_sentiment(final_summary)
-            st.write(sentiment)
 else:
-    st.info("📥 Upload a PDF to begin.")

 import streamlit as st
 import fitz  # PyMuPDF
 import os
+import time
+import tempfile
+import faiss
+import numpy as np
 from dotenv import load_dotenv
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.docstore.document import Document
 from keybert import KeyBERT
 from textblob import TextBlob
+from groq import Groq
+# Load environment
 load_dotenv()
+client = Groq(api_key=os.environ.get("wbm1"))
 GROQ_MODEL = "llama3-8b-8192"
+# Streamlit setup
+st.set_page_config(page_title="🧠 Smart PDF ChatBot", layout="centered")
+st.title("💬 Smart PDF ChatBot")
 st.markdown("""
+Upload one or more PDFs. Get summaries, insights, and interact with AI about the content using a persistent memory chat.
 """)
+uploaded_files = st.file_uploader("📁 Upload PDF files", type=["pdf"], accept_multiple_files=True)
+# Utilities
 def extract_text_from_pdf(file):
     doc = fitz.open(stream=file.read(), filetype="pdf")
     text = ""
         text += page.get_text()
     return text
+def split_text(text):
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    return splitter.split_text(text)
+def create_vector_store(chunks):
+    documents = [Document(page_content=c) for c in chunks]
+    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+    return FAISS.from_documents(documents, embeddings)
+def summarize_text(text):
+    response = client.chat.completions.create(
+        model=GROQ_MODEL,
+        messages=[
+            {"role": "system", "content": "You are an AI that summarizes documents."},
+            {"role": "user", "content": f"Summarize this:\n{text}"}
+        ]
+    )
+    return response.choices[0].message.content
+def ask_question(vectorstore, question):
+    docs = vectorstore.similarity_search(question, k=3)
+    context = "\n".join([d.page_content for d in docs])
+    response = client.chat.completions.create(
+        model=GROQ_MODEL,
+        messages=[
+            {"role": "system", "content": "You answer questions based on document context."},
+            {"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{question}"}
+        ]
+    )
+    return response.choices[0].message.content
 def extract_keywords(text, top_n=10):
     kw_model = KeyBERT()
     keywords = kw_model.extract_keywords(text, top_n=top_n, stop_words='english')
     return [kw[0] for kw in keywords]
 def get_sentiment(text):
     blob = TextBlob(text)
     polarity = blob.sentiment.polarity
     else:
         return "😐 Neutral"
 def make_download_button(text, filename="summary.txt"):
     st.download_button("💾 Download Summary", data=text, file_name=filename, mime="text/plain")
+# App logic
+if uploaded_files:
+    all_text = ""
+    for file in uploaded_files:
+        st.write(f"📄 Processing {file.name}...")
+        text = extract_text_from_pdf(file)
+        all_text += f"\n\n{text}"
+    st.subheader("🔍 Extracting Insights...")
+    chunks = split_text(all_text)
+    vectorstore = create_vector_store(chunks)
+    st.write("📄 Generating summary...")
+    summary = summarize_text(all_text)
+    st.success(summary)
+    make_download_button(summary)
+    st.subheader("🔑 Keywords")
+    keywords = extract_keywords(summary)
+    st.write(", ".join(keywords))
+    st.subheader("📊 Sentiment")
+    sentiment = get_sentiment(summary)
+    st.write(sentiment)
+    st.markdown("---")
+    st.subheader("💬 Ask a question about the documents")
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = []
+    user_question = st.text_input("Type your question")
+    if user_question:
+        with st.spinner("🤖 Thinking..."):
+            answer = ask_question(vectorstore, user_question)
+            st.session_state.chat_history.append((user_question, answer))
+    for q, a in st.session_state.chat_history:
+        st.markdown(f"**You:** {q}")
+        st.markdown(f"**AI:** {a}")
 else:
+    st.info("📥 Upload one or more PDF files to get started.")