waqasbm committed on
Commit
27d2624
Β·
verified Β·
1 Parent(s): 8b7511d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -79
app.py CHANGED
@@ -1,30 +1,34 @@
1
  import streamlit as st
2
  import fitz # PyMuPDF
3
- import requests
4
  import os
 
 
 
 
5
  from dotenv import load_dotenv
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 
7
  from keybert import KeyBERT
8
  from textblob import TextBlob
 
9
 
10
- # Setup
11
  load_dotenv()
12
- GROQ_API_KEY = os.getenv("wbm1")
13
- GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
14
  GROQ_MODEL = "llama3-8b-8192"
15
 
16
- st.set_page_config(page_title="🧠 Smart PDF Extractor", layout="centered")
17
- st.title("πŸ“„ Smart PDF Extractor & AI Summarizer")
18
-
19
  st.markdown("""
20
- Extract summaries, insights, keywords, and sentiment from your PDFs using AI.
21
  """)
22
 
23
- uploaded_file = st.file_uploader("πŸ“ Upload your PDF file", type=["pdf"])
24
-
25
-
26
- # ---------- Utilities ----------
27
 
 
28
  def extract_text_from_pdf(file):
29
  doc = fitz.open(stream=file.read(), filetype="pdf")
30
  text = ""
@@ -32,38 +36,42 @@ def extract_text_from_pdf(file):
32
  text += page.get_text()
33
  return text
34
 
35
-
36
- def split_text_langchain(text, chunk_size=3000, chunk_overlap=200):
37
- splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
38
- chunks = splitter.split_text(text)
39
- return chunks
40
-
41
-
42
- def summarize_chunk(text, prompt):
43
- headers = {
44
- "Authorization": f"Bearer {GROQ_API_KEY}",
45
- "Content-Type": "application/json"
46
- }
47
- payload = {
48
- "model": GROQ_MODEL,
49
- "messages": [
50
- {"role": "system", "content": prompt},
51
- {"role": "user", "content": text}
52
- ],
53
- "temperature": 0.3,
54
- "max_tokens": 1024
55
- }
56
- response = requests.post(GROQ_API_URL, headers=headers, json=payload)
57
- response.raise_for_status()
58
- return response.json()["choices"][0]["message"]["content"]
59
-
 
 
 
 
 
60
 
61
  def extract_keywords(text, top_n=10):
62
  kw_model = KeyBERT()
63
  keywords = kw_model.extract_keywords(text, top_n=top_n, stop_words='english')
64
  return [kw[0] for kw in keywords]
65
 
66
-
67
  def get_sentiment(text):
68
  blob = TextBlob(text)
69
  polarity = blob.sentiment.polarity
@@ -74,48 +82,47 @@ def get_sentiment(text):
74
  else:
75
  return "😐 Neutral"
76
 
77
-
78
  def make_download_button(text, filename="summary.txt"):
79
  st.download_button("πŸ’Ύ Download Summary", data=text, file_name=filename, mime="text/plain")
80
 
81
-
82
- # ---------- Main Logic ----------
83
-
84
- if uploaded_file:
85
- with st.spinner("🧠 Reading and analyzing PDF..."):
86
- pdf_text = extract_text_from_pdf(uploaded_file)
87
- chunks = split_text_langchain(pdf_text)
88
-
89
- prompt = (
90
- "Summarize the following text clearly. Focus on main ideas, insights, data points, and useful information."
91
- )
92
-
93
- summaries = []
94
- for i, chunk in enumerate(chunks):
95
- st.write(f"⏳ Summarizing part {i + 1}/{len(chunks)}...")
96
- try:
97
- summary = summarize_chunk(chunk, prompt)
98
- summaries.append(summary)
99
- except Exception as e:
100
- st.error(f"Error summarizing chunk {i + 1}: {e}")
101
- break
102
-
103
- if summaries:
104
- final_summary = "\n\n".join(summaries)
105
-
106
- st.subheader("βœ… Final Summary")
107
- st.success(final_summary)
108
-
109
- make_download_button(final_summary)
110
-
111
- st.markdown("---")
112
- st.subheader("πŸ”‘ Keywords")
113
- keywords = extract_keywords(final_summary)
114
- st.write(", ".join(keywords))
115
-
116
- st.subheader("πŸ“Š Sentiment")
117
- sentiment = get_sentiment(final_summary)
118
- st.write(sentiment)
119
-
120
  else:
121
- st.info("πŸ“₯ Upload a PDF to begin.")
 
1
  import streamlit as st
2
  import fitz # PyMuPDF
 
3
  import os
4
+ import time
5
+ import tempfile
6
+ import faiss
7
+ import numpy as np
8
  from dotenv import load_dotenv
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.embeddings import HuggingFaceEmbeddings
11
+ from langchain.vectorstores import FAISS
12
+ from langchain.docstore.document import Document
13
  from keybert import KeyBERT
14
  from textblob import TextBlob
15
+ from groq import Groq
16
 
17
+ # Load environment
18
  load_dotenv()
19
+ client = Groq(api_key=os.environ.get("wbm1"))
 
20
  GROQ_MODEL = "llama3-8b-8192"
21
 
22
+ # Streamlit setup
23
+ st.set_page_config(page_title="🧠 Smart PDF ChatBot", layout="centered")
24
+ st.title("πŸ’¬ Smart PDF ChatBot")
25
  st.markdown("""
26
+ Upload one or more PDFs. Get summaries, insights, and interact with AI about the content using a persistent memory chat.
27
  """)
28
 
29
+ uploaded_files = st.file_uploader("πŸ“ Upload PDF files", type=["pdf"], accept_multiple_files=True)
 
 
 
30
 
31
+ # Utilities
32
  def extract_text_from_pdf(file):
33
  doc = fitz.open(stream=file.read(), filetype="pdf")
34
  text = ""
 
36
  text += page.get_text()
37
  return text
38
 
39
+ def split_text(text):
40
+ splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
41
+ return splitter.split_text(text)
42
+
43
+ def create_vector_store(chunks):
44
+ documents = [Document(page_content=c) for c in chunks]
45
+ embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
46
+ return FAISS.from_documents(documents, embeddings)
47
+
48
+ def summarize_text(text):
49
+ response = client.chat.completions.create(
50
+ model=GROQ_MODEL,
51
+ messages=[
52
+ {"role": "system", "content": "You are an AI that summarizes documents."},
53
+ {"role": "user", "content": f"Summarize this:\n{text}"}
54
+ ]
55
+ )
56
+ return response.choices[0].message.content
57
+
58
+ def ask_question(vectorstore, question):
59
+ docs = vectorstore.similarity_search(question, k=3)
60
+ context = "\n".join([d.page_content for d in docs])
61
+ response = client.chat.completions.create(
62
+ model=GROQ_MODEL,
63
+ messages=[
64
+ {"role": "system", "content": "You answer questions based on document context."},
65
+ {"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{question}"}
66
+ ]
67
+ )
68
+ return response.choices[0].message.content
69
 
70
  def extract_keywords(text, top_n=10):
71
  kw_model = KeyBERT()
72
  keywords = kw_model.extract_keywords(text, top_n=top_n, stop_words='english')
73
  return [kw[0] for kw in keywords]
74
 
 
75
  def get_sentiment(text):
76
  blob = TextBlob(text)
77
  polarity = blob.sentiment.polarity
 
82
  else:
83
  return "😐 Neutral"
84
 
 
85
  def make_download_button(text, filename="summary.txt"):
86
  st.download_button("πŸ’Ύ Download Summary", data=text, file_name=filename, mime="text/plain")
87
 
88
+ # App logic
89
+ if uploaded_files:
90
+ all_text = ""
91
+ for file in uploaded_files:
92
+ st.write(f"πŸ“„ Processing {file.name}...")
93
+ text = extract_text_from_pdf(file)
94
+ all_text += f"\n\n{text}"
95
+
96
+ st.subheader("πŸ” Extracting Insights...")
97
+ chunks = split_text(all_text)
98
+ vectorstore = create_vector_store(chunks)
99
+
100
+ st.write("πŸ“„ Generating summary...")
101
+ summary = summarize_text(all_text)
102
+ st.success(summary)
103
+ make_download_button(summary)
104
+
105
+ st.subheader("πŸ”‘ Keywords")
106
+ keywords = extract_keywords(summary)
107
+ st.write(", ".join(keywords))
108
+
109
+ st.subheader("πŸ“Š Sentiment")
110
+ sentiment = get_sentiment(summary)
111
+ st.write(sentiment)
112
+
113
+ st.markdown("---")
114
+ st.subheader("πŸ’¬ Ask a question about the documents")
115
+ if "chat_history" not in st.session_state:
116
+ st.session_state.chat_history = []
117
+
118
+ user_question = st.text_input("Type your question")
119
+ if user_question:
120
+ with st.spinner("πŸ€– Thinking..."):
121
+ answer = ask_question(vectorstore, user_question)
122
+ st.session_state.chat_history.append((user_question, answer))
123
+
124
+ for q, a in st.session_state.chat_history:
125
+ st.markdown(f"**You:** {q}")
126
+ st.markdown(f"**AI:** {a}")
127
  else:
128
+ st.info("πŸ“₯ Upload one or more PDF files to get started.")