Ani14 committed
Commit bd2e62c · verified · 1 Parent(s): 27f01b8

Update app.py

Files changed (1)
  1. app.py +103 -182
app.py CHANGED
@@ -2,26 +2,56 @@ import os
  import streamlit as st
  import requests
  import datetime
- import feedparser
- import time
  from dotenv import load_dotenv
  from tavily import TavilyClient
  from fuzzywuzzy import fuzz
- from urllib.parse import quote_plus
- from PIL import Image
- from io import BytesIO
  from fpdf import FPDF

- # --- Load Keys ---
  load_dotenv()
  OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "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")
  tavily = TavilyClient(api_key=TAVILY_API_KEY)

- # --- Layout ---
  st.set_page_config("Deep Research Bot", layout="wide")
  with st.sidebar:
-     st.title("🧭 Research Input")
      topic = st.text_input("💡 What would you like me to research next?")
      report_type = st.selectbox("📄 Type of report", [
          "Summary - Short and fast (~2 min)",
@@ -37,186 +67,77 @@ with st.sidebar:
          "Web Only", "Academic Only", "Hybrid"
      ])
      custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")

  st.title("🤖 Real-time Deep Research Agent (Tavily Edition)")
- st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time using Tavily, ArXiv, and Semantic Scholar.")

- # --- Helper Functions ---
- def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
-     url = "https://openrouter.ai/api/v1/chat/completions"
-     headers = {
-         "Authorization": f"Bearer {OPENROUTER_API_KEY}",
-         "Content-Type": "application/json",
-         "X-Title": "GPT Deep Research Agent"
-     }
-     data = {
-         "model": model,
-         "messages": messages,
-         "max_tokens": max_tokens,
-         "temperature": temperature
-     }
-     response = requests.post(url, headers=headers, json=data)
-     result = response.json()
-     if response.status_code != 200:
-         raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
-     return result["choices"][0]["message"]["content"]
-
- def get_sources(topic, domains=None):
-     query = topic
-     if domains:
-         domain_filters = [d.strip() for d in domains.split(",") if d.strip()]
-         query += " site:" + " OR site:".join(domain_filters)
-
-     response = tavily.search(query=query, search_depth="advanced", max_results=10)
-     sources = []
-     for item in response.get("results", []):
-         sources.append({
-             "title": item.get("title"),
-             "url": item.get("url"),
-             "snippet": item.get("content", "")
-         })
-     return sources
-
- def get_arxiv_papers(query):
-     url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results=5"
-     feed = feedparser.parse(url)
-     return [{
-         "title": e.title,
-         "summary": e.summary.replace("\n", " ").strip(),
-         "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
-     } for e in feed.entries]
-
- def get_semantic_papers(query):
-     url = "https://api.semanticscholar.org/graph/v1/paper/search"
-     params = {"query": query, "limit": 5, "fields": "title,abstract,url"}
-     response = requests.get(url, params=params)
-     papers = response.json().get("data", [])
-     return [{
-         "title": p.get("title"),
-         "summary": p.get("abstract", "No abstract available"),
-         "url": p.get("url")
-     } for p in papers]
-
- def generate_apa_citation(title, url, source):
-     year = datetime.datetime.now().year
-     label = {
-         "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
-     }.get(source, "*Web*")
-     return f"{title}. ({year}). {label}. {url}"
-
- def check_plagiarism(text, topic):
-     hits = []
-     for r in get_sources(topic, ""):
-         similarity = fuzz.token_set_ratio(text, r["snippet"])
-         if similarity >= 75:
-             hits.append(r)
-     return hits
-
- def remove_duplicates(entries):
-     unique = []
-     titles = []
-     for e in entries:
-         if all(fuzz.token_set_ratio(e["title"], t) < 85 for t in titles):
-             titles.append(e["title"])
-             unique.append(e)
-     return unique
-
- def generate_image_from_topic(topic):
-     img_prompt = f"Illustration representing '{topic}' in a research or technology context."
-     image_url = f"https://source.unsplash.com/featured/?{quote_plus(topic)}"
-     return image_url
-
- def generate_pdf(text):
-     pdf = FPDF()
-     pdf.add_page()
-     pdf.set_auto_page_break(auto=True, margin=15)
-     pdf.set_font("Arial", size=12)
-     for line in text.split("\n"):
-         pdf.multi_cell(0, 10, line)
-     buffer = BytesIO()
-     pdf.output(buffer)
-     buffer.seek(0)
-     return buffer
-
- # --- Execution ---
- if st.button("Research"):
      try:
-         with st.spinner("🔍 Gathering relevant research..."):
-             all_entries = []
-             citations = []

          if source_type in ["Web Only", "Hybrid"]:
-             web_data = get_sources(topic, custom_domains)
-             web_data = remove_duplicates(web_data)
-             for w in web_data:
-                 all_entries.append({
-                     "title": w['title'],
-                     "summary": w['snippet'],
-                     "url": w['url'],
-                     "source": "web"
-                 })
-                 citations.append(generate_apa_citation(w['title'], w['url'], "web"))
-
          if source_type in ["Academic Only", "Hybrid"]:
-             arxiv_data = get_arxiv_papers(topic)
-             semantic_data = get_semantic_papers(topic)
-             academic_data = remove_duplicates(arxiv_data + semantic_data)
-             for a in academic_data:
-                 all_entries.append({
-                     "title": a['title'],
-                     "summary": a['summary'],
-                     "url": a['url'],
-                     "source": "arxiv" if "arxiv" in a['url'] else "semantic"
-                 })
-                 citations.append(generate_apa_citation(a['title'], a['url'], a['source']))
-
-         st.success("✅ Data collected and filtered!")
-
-         with st.spinner("🧠 Writing final research report..."):
-             sources_text = ""
-             for e in all_entries:
-                 sources_text += f"- [{e['title']}]({e['url']})\n> {e['summary'][:300]}...\n\n"
-
-             prompt = f"""
-             # Research Task: {topic}
-             Tone: {tone}
-             Report Type: {report_type}
-             Sources:
-             {sources_text}
-             Now, synthesize:
-             1. Research questions and gap
-             2. A novel insight or direction
-             3. A real-world application scenario
-             4. A {report_type.lower()} in paragraph format (use bullet points only if the paragraph is too long).
-             Use larger heading for sections and slightly smaller for sub-sections. Do not use markdown or HTML, just plain text.
-             """
-             output = call_llm([{"role": "user", "content": prompt}], max_tokens=3500)
-
-         st.header("📄 Research Report")
-         st.write(output)
-
-         st.subheader("📚 APA Citations")
-         for c in citations:
-             st.markdown(f"- {c}")
-
-         with st.spinner("🧪 Checking for overlaps..."):
-             overlaps = check_plagiarism(output, topic)
-             if overlaps:
-                 st.warning("⚠️ Potential content overlap found.")
-                 for h in overlaps:
-                     st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
-             else:
-                 st.success("✅ No major overlaps detected.")
-
-         if report_type.startswith("Thorough"):
-             st.subheader("🖼️ Related Visual")
-             image_url = generate_image_from_topic(topic)
-             st.image(image_url, caption=f"Visual related to: {topic}", use_column_width=True)
-
-         st.subheader("📥 Download Options")
-         pdf_file = generate_pdf(output)
-         st.download_button("📄 Download PDF", data=pdf_file, file_name=f"{topic}_report.pdf", mime="application/pdf")
-         st.download_button("📜 Download LaTeX (raw text)", data=output, file_name=f"{topic}_report.tex", mime="text/plain")

      except Exception as e:
-         st.error(f"Error: {e}")
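Note: the updated file (additions below) still calls call_llm, generate_apa_citation, get_arxiv_papers, get_semantic_papers, and check_plagiarism, even though this commit deletes their definitions; as committed, clicking Start Research would raise a NameError unless they are restored. A minimal sketch of call_llm along the lines of the removed implementation (the explicit timeout is an assumption, not in the original):

import requests

def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
    # Sketch reconstructed from the removed implementation above.
    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",  # assumes the key loaded via dotenv above
            "Content-Type": "application/json",
            "X-Title": "GPT Deep Research Agent",
        },
        json={"model": model, "messages": messages,
              "max_tokens": max_tokens, "temperature": temperature},
        timeout=60,  # assumption: the removed code sent the request without a timeout
    )
    result = response.json()
    if response.status_code != 200:
        raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
    return result["choices"][0]["message"]["content"]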
 
  import streamlit as st
  import requests
  import datetime
  from dotenv import load_dotenv
  from tavily import TavilyClient
+ import feedparser
+ import time
  from fuzzywuzzy import fuzz
  from fpdf import FPDF

+ # Load environment variables
  load_dotenv()
  OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "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")
  tavily = TavilyClient(api_key=TAVILY_API_KEY)

+ # --- Helper Functions ---
+ def get_sources(topic, domains=None):
+     query = topic
+     if domains:
+         domain_filters = [d.strip() for d in domains.split(",") if d.strip()]
+         query += " site:" + " OR site:".join(domain_filters)
+
+     response = tavily.search(query=query, search_depth="advanced", max_results=10)
+     sources = []
+     for item in response.get("results", []):
+         sources.append({
+             "title": item.get("title"),
+             "url": item.get("url"),
+             "snippet": item.get("content", "")
+         })
+     return sources
+
+ def merge_duplicates(data):
+     seen_titles = {}
+     unique_data = []
+     for item in data:
+         title = item['title']
+         is_duplicate = False
+         for seen_title in seen_titles:
+             if fuzz.ratio(title.lower(), seen_title.lower()) > 85:
+                 is_duplicate = True
+                 break
+         if not is_duplicate:
+             seen_titles[title] = True
+             unique_data.append(item)
+     return unique_data
+
+ # --- Streamlit UI ---
  st.set_page_config("Deep Research Bot", layout="wide")
+
  with st.sidebar:
+     st.header("🧪 Research Configuration")
      topic = st.text_input("💡 What would you like me to research next?")
      report_type = st.selectbox("📄 Type of report", [
          "Summary - Short and fast (~2 min)",

          "Web Only", "Academic Only", "Hybrid"
      ])
      custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
+     research_triggered = st.button("🔎 Start Research")

  st.title("🤖 Real-time Deep Research Agent (Tavily Edition)")
+ st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time using Tavily.")

+ if research_triggered:
      try:
+         with st.status("Starting agent tasks..."):
+             st.info("🧠 Thinking through research questions...")
+             time.sleep(1)
+             st.info("🌐 Fetching data from selected sources...")

+         raw_data = []
+         citations = []
          if source_type in ["Web Only", "Hybrid"]:
+             web = get_sources(topic, custom_domains)
+             raw_data.extend(web)
+             for w in web:
+                 citations.append(generate_apa_citation(w["title"], w["url"], "web"))
          if source_type in ["Academic Only", "Hybrid"]:
+             arxiv = get_arxiv_papers(topic)
+             scholar = get_semantic_papers(topic)
+             raw_data.extend(arxiv)
+             raw_data.extend(scholar)
+             for p in arxiv:
+                 citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
+             for s in scholar:
+                 citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))
+
+         # Merge duplicates before formatting
+         filtered_data = merge_duplicates(raw_data)
+         all_data = ""
+         for item in filtered_data:
+             summary = item.get("snippet") or item.get("summary", "")
+             all_data += f"- [{item['title']}]({item['url']})\n> {summary[:300]}...\n\n"
+
+         st.success("Data collection complete!")
+
+         with st.spinner("📝 Writing final research report..."):
+             prompt = generate_prompt(report_type, tone, topic, all_data)
+             output = call_llm([{"role": "user", "content": prompt}], max_tokens=4000)
+
+         st.markdown("## 📄 Research Report")
+         st.markdown(f"<div style='font-size:16px;'>{output}</div>", unsafe_allow_html=True)
+
+         if "Detailed" in report_type or "Thorough" in report_type:
+             st.markdown("## 📚 APA Citations")
+             for c in citations:
+                 st.markdown(f"- {c}")
+
+         if "Thorough" in report_type:
+             image_links = get_image_links(topic)
+             if image_links:
+                 st.markdown("## 🖼️ Related Visuals")
+                 for img in image_links:
+                     st.image(img, use_column_width=True)
+
+         with st.spinner("🧪 Checking for overlaps..."):
+             overlaps = check_plagiarism(output, topic)
+             if overlaps:
+                 st.warning("⚠️ Potential content overlap found.")
+                 for h in overlaps:
+                     st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
+             else:
+                 st.success("✅ No major overlaps detected.")
+
+         pdf_path = save_pdf(output)
+         with open(pdf_path, "rb") as pdf_file:
+             st.download_button("📄 Download PDF", pdf_file, file_name="research_report.pdf")
+
+         st.download_button("📄 Download LaTeX", output.encode("utf-8"), file_name="research_report.tex")

      except Exception as e:
+         st.error(f"Error: {e}")
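generate_prompt and save_pdf are likewise referenced above but defined in neither version of the file. Hypothetical sketches, with names and signatures inferred from the call sites and bodies modeled on the removed inline prompt and the removed generate_pdf (not part of the commit):

from fpdf import FPDF

def generate_prompt(report_type, tone, topic, sources_text):
    # Modeled on the f-string prompt the commit removed.
    return (
        f"# Research Task: {topic}\n"
        f"Tone: {tone}\n"
        f"Report Type: {report_type}\n"
        f"Sources:\n{sources_text}\n"
        "Now, synthesize:\n"
        "1. Research questions and gap\n"
        "2. A novel insight or direction\n"
        "3. A real-world application scenario\n"
        f"4. A {report_type.lower()} in paragraph format.\n"
    )

def save_pdf(text, path="research_report.pdf"):
    # The call site reopens the returned value with open(pdf_path, "rb"),
    # so this writes to disk instead of returning a BytesIO buffer
    # the way the removed generate_pdf did.
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        pdf.multi_cell(0, 10, line)
    pdf.output(path)
    return path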
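get_image_links is also undefined after this commit. One hedged possibility is Tavily's include_images search option (an assumption; the commit does not show how images are meant to be fetched):

def get_image_links(topic, max_images=3):
    # Assumes the module-level tavily client defined above; include_images
    # asks Tavily to return an "images" list of URLs alongside text results.
    response = tavily.search(query=topic, include_images=True)
    return response.get("images", [])[:max_images]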