waqasbm committed on
Commit
1217112
·
verified ·
1 Parent(s): 4fa332a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -25
app.py CHANGED
@@ -3,22 +3,27 @@ import fitz # PyMuPDF
3
  import requests
4
  import os
5
  from dotenv import load_dotenv
 
 
 
6
 
7
- # Load environment variables
8
  load_dotenv()
9
- GROQ_API_KEY = os.getenv("wbm1") # Put this in your .env file or Hugging Face secrets
10
  GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
11
- GROQ_MODEL = "llama3-8b-8192" # or use llama3-70b-8192 for more power
12
 
13
- st.set_page_config(page_title="πŸ“„ PDF Data Extractor AI", layout="centered")
14
- st.title("πŸ“„ Intelligent PDF Data Extractor & Summarizer")
15
 
16
  st.markdown("""
17
- Upload a PDF and extract key insights automatically using AI.
18
- This tool helps improve decision-making, reduce errors, and boost productivity.
19
  """)
20
 
21
- uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
 
 
 
22
 
23
  def extract_text_from_pdf(file):
24
  doc = fitz.open(stream=file.read(), filetype="pdf")
@@ -27,7 +32,14 @@ def extract_text_from_pdf(file):
27
  text += page.get_text()
28
  return text
29
 
30
- def query_groq(text, system_prompt):
 
 
 
 
 
 
 
31
  headers = {
32
  "Authorization": f"Bearer {GROQ_API_KEY}",
33
  "Content-Type": "application/json"
@@ -35,36 +47,75 @@ def query_groq(text, system_prompt):
35
  payload = {
36
  "model": GROQ_MODEL,
37
  "messages": [
38
- {"role": "system", "content": system_prompt},
39
  {"role": "user", "content": text}
40
  ],
41
- "temperature": 0.2,
42
  "max_tokens": 1024
43
  }
44
  response = requests.post(GROQ_API_URL, headers=headers, json=payload)
45
  response.raise_for_status()
46
  return response.json()["choices"][0]["message"]["content"]
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  if uploaded_file:
49
- with st.spinner("πŸ” Extracting and summarizing..."):
50
- raw_text = extract_text_from_pdf(uploaded_file)
 
51
 
52
- # Summarize using GROQ
53
  prompt = (
54
- "You are an intelligent PDF data assistant. Read the document and extract a clear summary. "
55
- "Highlight key insights, decisions, data points, and actionable information. "
56
- "Return a structured summary that enhances decision-making and productivity."
57
  )
58
 
59
- try:
60
- summary = query_groq(raw_text, prompt)
61
- st.subheader("🧠 Extracted Summary")
62
- st.success(summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  st.markdown("---")
65
- st.caption("βœ… Powered by GROQ LLaMA and PyMuPDF. Safe and secure local processing.")
 
 
 
 
 
 
66
 
67
- except Exception as e:
68
- st.error(f"❌ Failed to extract summary: {e}")
69
  else:
70
- st.info("πŸ“₯ Please upload a PDF file to begin.")
 
3
  import requests
4
  import os
5
  from dotenv import load_dotenv
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from keybert import KeyBERT
8
+ from textblob import TextBlob
9
 
10
+ # Setup
11
  load_dotenv()
12
+ GROQ_API_KEY = os.getenv("wbm1")
13
  GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
14
+ GROQ_MODEL = "llama3-8b-8192"
15
 
16
+ st.set_page_config(page_title="🧠 Smart PDF Extractor", layout="centered")
17
+ st.title("πŸ“„ Smart PDF Extractor & AI Summarizer")
18
 
19
  st.markdown("""
20
+ Extract summaries, insights, keywords, and sentiment from your PDFs using AI.
 
21
  """)
22
 
23
+ uploaded_file = st.file_uploader("πŸ“ Upload your PDF file", type=["pdf"])
24
+
25
+
26
+ # ---------- Utilities ----------
27
 
28
  def extract_text_from_pdf(file):
29
  doc = fitz.open(stream=file.read(), filetype="pdf")
 
32
  text += page.get_text()
33
  return text
34
 
35
+
36
def split_text_langchain(text, chunk_size=3000, chunk_overlap=200):
    """Break ``text`` into overlapping chunks suitable for per-chunk summarization.

    Args:
        text: The full document text to split.
        chunk_size: Upper bound on the size of each chunk, passed straight to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Amount of text shared between consecutive chunks,
            passed straight to ``RecursiveCharacterTextSplitter``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` (the list of chunk strings).
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)
40
+
41
+
42
+ def summarize_chunk(text, prompt):
43
  headers = {
44
  "Authorization": f"Bearer {GROQ_API_KEY}",
45
  "Content-Type": "application/json"
 
47
  payload = {
48
  "model": GROQ_MODEL,
49
  "messages": [
50
+ {"role": "system", "content": prompt},
51
  {"role": "user", "content": text}
52
  ],
53
+ "temperature": 0.3,
54
  "max_tokens": 1024
55
  }
56
  response = requests.post(GROQ_API_URL, headers=headers, json=payload)
57
  response.raise_for_status()
58
  return response.json()["choices"][0]["message"]["content"]
59
 
60
+
61
def extract_keywords(text, top_n=10):
    """Return up to ``top_n`` keywords/keyphrases that KeyBERT extracts from ``text``.

    Args:
        text: The document to analyse.
        top_n: Maximum number of keywords to return.

    Returns:
        A list of keyword strings, in the order KeyBERT ranks them. The
        relevance scores KeyBERT returns alongside each keyword are dropped.
    """
    # Constructing KeyBERT loads an embedding model, which is expensive.
    # Build it once and keep it on the function object so later calls reuse
    # the same instance instead of reloading the model every time.
    model = getattr(extract_keywords, "_model", None)
    if model is None:
        model = extract_keywords._model = KeyBERT()

    ranked = model.extract_keywords(text, top_n=top_n, stop_words="english")
    return [keyword for keyword, _score in ranked]
65
+
66
+
67
def get_sentiment(text):
    """Classify the overall tone of ``text`` as positive, negative, or neutral.

    The decision is based on TextBlob's polarity score for the text:
    strictly above 0.2 is positive, strictly below -0.2 is negative, and
    anything in between (inclusive) is neutral.

    Args:
        text: The text to analyse.

    Returns:
        An emoji-prefixed label: "😊 Positive", "😞 Negative", or "😐 Neutral".
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0.2:
        return "😊 Positive"
    if polarity < -0.2:
        return "😞 Negative"
    return "😐 Neutral"
76
+
77
+
78
def make_download_button(text, filename="summary.txt"):
    """Render a Streamlit button that downloads ``text`` as a plain-text file.

    Args:
        text: The content offered for download.
        filename: The file name suggested to the browser.
    """
    st.download_button(
        "💾 Download Summary",
        data=text,
        file_name=filename,
        mime="text/plain",
    )
80
+
81
+
82
# ---------- Main Logic ----------
# Once a PDF is uploaded: extract its text, summarize it chunk by chunk, then
# show the combined summary together with its keywords and sentiment.

if uploaded_file:
    with st.spinner("🧠 Reading and analyzing PDF..."):
        pdf_text = extract_text_from_pdf(uploaded_file)
        chunks = split_text_langchain(pdf_text)

        prompt = (
            "Summarize the following text clearly. Focus on main ideas, insights, data points, and useful information."
        )

        # With no chunks, nothing below would render; tell the user why
        # instead of leaving the page blank.
        if not chunks:
            st.warning("No extractable text was found in this PDF.")

        summaries = []
        total_chunks = len(chunks)
        for part, chunk in enumerate(chunks, start=1):
            st.write(f"⏳ Summarizing part {part}/{total_chunks}...")
            try:
                summaries.append(summarize_chunk(chunk, prompt))
            except Exception as e:
                # Stop at the first failing chunk; summaries collected so
                # far are still shown below.
                st.error(f"Error summarizing chunk {part}: {e}")
                break

        if summaries:
            final_summary = "\n\n".join(summaries)

            st.subheader("✅ Final Summary")
            st.success(final_summary)

            make_download_button(final_summary)

            st.markdown("---")
            st.subheader("🔑 Keywords")
            keywords = extract_keywords(final_summary)
            st.write(", ".join(keywords))

            st.subheader("📊 Sentiment")
            sentiment = get_sentiment(final_summary)
            st.write(sentiment)

else:
    st.info("📥 Upload a PDF to begin.")