Ani14 committed
Commit 05644a0 · verified · 1 Parent(s): bd2e62c

Update app.py

Files changed (1)
  1. app.py +159 -86
app.py CHANGED
@@ -7,21 +7,42 @@ from tavily import TavilyClient
 import feedparser
 import time
 from fuzzywuzzy import fuzz
+from PIL import Image
+from io import BytesIO
 from fpdf import FPDF
+import base64

 # Load environment variables
 load_dotenv()
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
-TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "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")
+TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 tavily = TavilyClient(api_key=TAVILY_API_KEY)

 # --- Helper Functions ---
+def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=3500, temperature=0.7):
+    url = "https://openrouter.ai/api/v1/chat/completions"
+    headers = {
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+        "Content-Type": "application/json",
+        "X-Title": "GPT Deep Research Agent"
+    }
+    data = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "temperature": temperature
+    }
+    response = requests.post(url, headers=headers, json=data)
+    result = response.json()
+    if response.status_code != 200:
+        raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
+    return result["choices"][0]["message"]["content"]
+
 def get_sources(topic, domains=None):
     query = topic
     if domains:
         domain_filters = [d.strip() for d in domains.split(",") if d.strip()]
         query += " site:" + " OR site:".join(domain_filters)
-
     response = tavily.search(query=query, search_depth="advanced", max_results=10)
     sources = []
     for item in response.get("results", []):
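The hunk above does two things worth noting: it drops a hardcoded Tavily API key in favor of a plain environment lookup, and it introduces a `call_llm` helper around OpenRouter's chat-completions endpoint that raises on non-200 responses. A minimal usage sketch, assuming `OPENROUTER_API_KEY` is exported and the pinned free DeepSeek route is still served; the prompt is purely illustrative:

```python
# Illustrative call to the call_llm helper added in this hunk.
# Assumes OPENROUTER_API_KEY is set in the environment; the default model
# slug ("deepseek/deepseek-chat-v3-0324:free") comes from the diff and may
# be renamed or withdrawn on OpenRouter's side.
reply = call_llm(
    messages=[{"role": "user", "content": "List three uses of retrieval-augmented generation."}],
    max_tokens=256,
    temperature=0.3,
)
print(reply)
```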
@@ -32,27 +53,84 @@ def get_sources(topic, domains=None):
         })
     return sources

-def merge_duplicates(data):
-    seen_titles = {}
-    unique_data = []
-    for item in data:
-        title = item['title']
-        is_duplicate = False
-        for seen_title in seen_titles:
-            if fuzz.ratio(title.lower(), seen_title.lower()) > 85:
-                is_duplicate = True
-                break
-        if not is_duplicate:
-            seen_titles[title] = True
-            unique_data.append(item)
-    return unique_data
+def get_arxiv_papers(query):
+    from urllib.parse import quote_plus
+    url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results=5"
+    feed = feedparser.parse(url)
+    return [{
+        "title": e.title,
+        "summary": e.summary.replace("\n", " ").strip(),
+        "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
+    } for e in feed.entries]
+
+def get_semantic_papers(query):
+    url = "https://api.semanticscholar.org/graph/v1/paper/search"
+    params = {"query": query, "limit": 5, "fields": "title,abstract,url"}
+    response = requests.get(url, params=params)
+    papers = response.json().get("data", [])
+    return [{
+        "title": p.get("title"),
+        "summary": p.get("abstract", "No abstract available"),
+        "url": p.get("url")
+    } for p in papers]
+
+def check_plagiarism(text, topic):
+    hits = []
+    for r in get_sources(topic):
+        similarity = fuzz.token_set_ratio(text, r["snippet"])
+        if similarity >= 75:
+            hits.append(r)
+    return hits
+
+def generate_apa_citation(title, url, source):
+    year = datetime.datetime.now().year
+    label = {
+        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
+    }.get(source, "*Web*")
+    return f"{title}. ({year}). {label}. {url}"
+
+def merge_duplicates(entries):
+    unique = []
+    seen_titles = []
+    for entry in entries:
+        if all(fuzz.token_set_ratio(entry['title'], seen) < 90 for seen in seen_titles):
+            unique.append(entry)
+            seen_titles.append(entry['title'])
+    return unique
+
+def generate_pdf(text):
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_auto_page_break(auto=True, margin=15)
+    pdf.set_font("Arial", size=12)
+    for line in text.split('\n'):
+        pdf.multi_cell(0, 10, line)
+    pdf_output = BytesIO()
+    pdf.output(pdf_output)
+    pdf_output.seek(0)
+    return pdf_output
+
+def generate_latex(text):
+    latex = "\\documentclass{article}\n\\usepackage{hyperref}\n\\begin{document}\n"
+    for line in text.split('\n'):
+        latex += line.replace('_', '\\_') + "\\\\\n"
+    latex += "\\end{document}"
+    return BytesIO(latex.encode("utf-8"))
+
+def generate_download_button(file, label, mime_type):
+    b64 = base64.b64encode(file.read()).decode()
+    return f"""
+    <a href="data:{mime_type};base64,{b64}" download="{label}">
+        📥 Download {label}
+    </a>
+    """

 # --- Streamlit UI ---
 st.set_page_config("Deep Research Bot", layout="wide")

 with st.sidebar:
-    st.header("🧪 Research Configuration")
-    topic = st.text_input("💡 What would you like me to research next?")
+    st.title("🧠 Deep Research Assistant")
+    topic = st.text_input("💡 Topic to research")
     report_type = st.selectbox("📄 Type of report", [
         "Summary - Short and fast (~2 min)",
         "Detailed Report (~5 min)",
@@ -63,81 +141,76 @@ with st.sidebar:
         "Persuasive - Advocating a specific point of view",
         "Narrative - Storytelling tone for layperson readers"
     ])
-    source_type = st.selectbox("🌐 Sources to include", [
-        "Web Only", "Academic Only", "Hybrid"
-    ])
+    source_type = st.selectbox("🌐 Sources to include", ["Web Only", "Academic Only", "Hybrid"])
     custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
-    research_triggered = st.button("🔎 Start Research")
+    research_button = st.button("Research")

-st.title("🤖 Real-time Deep Research Agent (Tavily Edition)")
-st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time using Tavily.")
+st.title("📑 Research Output")

-if research_triggered:
+if research_button and topic:
     try:
-        with st.status("Starting agent tasks..."):
-            st.info("🧠 Thinking through research questions...")
-            time.sleep(1)
-            st.info("🌐 Fetching data from selected sources...")
+        with st.status("🔍 Gathering data..."):
+            st.info("Fetching from sources...")

-        raw_data = []
+        all_sources = []
         citations = []
+
         if source_type in ["Web Only", "Hybrid"]:
-            web = get_sources(topic, custom_domains)
-            raw_data.extend(web)
-            for w in web:
-                citations.append(generate_apa_citation(w["title"], w["url"], "web"))
+            web_data = get_sources(topic, custom_domains)
+            for item in web_data:
+                all_sources.append(item | {"source": "web"})
+
         if source_type in ["Academic Only", "Hybrid"]:
-            arxiv = get_arxiv_papers(topic)
-            scholar = get_semantic_papers(topic)
-            raw_data.extend(arxiv)
-            raw_data.extend(scholar)
-            for p in arxiv:
-                citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
-            for s in scholar:
-                citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))
-
-        # Merge duplicates before formatting
-        filtered_data = merge_duplicates(raw_data)
-        all_data = ""
-        for item in filtered_data:
-            summary = item.get("snippet") or item.get("summary", "")
-            all_data += f"- [{item['title']}]({item['url']})\n> {summary[:300]}...\n\n"
-
-        st.success("Data collection complete!")
-
-        with st.spinner("📝 Writing final research report..."):
-            prompt = generate_prompt(report_type, tone, topic, all_data)
-            output = call_llm([{"role": "user", "content": prompt}], max_tokens=4000)
-
-        st.markdown("## 📄 Research Report")
-        st.markdown(f"<div style='font-size:16px;'>{output}</div>", unsafe_allow_html=True)
-
-        if "Detailed" in report_type or "Thorough" in report_type:
-            st.markdown("## 📚 APA Citations")
-            for c in citations:
-                st.markdown(f"- {c}")
-
-        if "Thorough" in report_type:
-            image_links = get_image_links(topic)
-            if image_links:
-                st.markdown("## 🖼️ Related Visuals")
-                for img in image_links:
-                    st.image(img, use_column_width=True)
-
-        with st.spinner("🧪 Checking for overlaps..."):
-            overlaps = check_plagiarism(output, topic)
-            if overlaps:
-                st.warning("⚠️ Potential content overlap found.")
-                for h in overlaps:
-                    st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
-            else:
-                st.success("✅ No major overlaps detected.")
-
-        pdf_path = save_pdf(output)
-        with open(pdf_path, "rb") as pdf_file:
-            st.download_button("📄 Download PDF", pdf_file, file_name="research_report.pdf")
-
-        st.download_button("📄 Download LaTeX", output.encode("utf-8"), file_name="research_report.tex")
+            arxiv_data = get_arxiv_papers(topic)
+            for item in arxiv_data:
+                all_sources.append(item | {"source": "arxiv"})
+            semantic_data = get_semantic_papers(topic)
+            for item in semantic_data:
+                all_sources.append(item | {"source": "semantic"})
+
+        merged = merge_duplicates(all_sources)
+        combined_text = ""
+        for m in merged:
+            combined_text += f"- [{m['title']}]({m['url']})\n> {m.get('snippet', m.get('summary', ''))[:300]}...\n\n"
+            citations.append(generate_apa_citation(m['title'], m['url'], m['source']))
+
+        with st.spinner("✍️ Synthesizing report..."):
+            prompt = f"""
+            # Research Topic: {topic}
+            Tone: {tone}
+            Type: {report_type}
+            Sources:
+            {combined_text}
+            Write the report in academic markdown with paragraphs (use bullet points only when necessary). Include:
+            1. Introduction
+            2. Research Gap
+            3. Novel Insight
+            4. Application
+            5. Full Academic Writeup if Thorough Report
+            """
+            final_output = call_llm([{"role": "user", "content": prompt}])
+
+        st.markdown(f"### 📄 {report_type}")
+        st.markdown(final_output, unsafe_allow_html=True)
+
+        st.markdown("### 📚 Citations (APA Format)")
+        for cite in citations:
+            st.markdown(f"- {cite}")
+
+        if report_type == "Thorough Academic Research (~10 min)":
+            with st.spinner("📦 Preparing PDF and LaTeX..."):
+                pdf_file = generate_pdf(final_output)
+                latex_file = generate_latex(final_output)
+                st.markdown(generate_download_button(pdf_file, "Research_Report.pdf", "application/pdf"), unsafe_allow_html=True)
+                st.markdown(generate_download_button(latex_file, "Research_Report.tex", "application/x-latex"), unsafe_allow_html=True)
+
+        overlaps = check_plagiarism(final_output, topic)
+        if overlaps:
+            st.warning("⚠️ Potential overlaps detected:")
+            for hit in overlaps:
+                st.markdown(f"- [{hit['title']}]({hit['url']})")
+        else:
+            st.success("✅ No major overlaps found.")

     except Exception as e:
-        st.error(f"Error: {e}")
+        st.error(f"Error: {e}")
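One portability note on the collection loop in the final hunk: `item | {"source": "web"}` relies on the dict-merge operator from PEP 584, which needs Python 3.9 or newer. On older interpreters, an unpacking merge is equivalent:

```python
# Equivalent to item | {"source": "web"} without the PEP 584 operator.
all_sources.append({**item, "source": "web"})
```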
 
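A caveat for anyone reusing `generate_pdf` from this commit: `pdf.output(pdf_output)` hands a `BytesIO` to fpdf's `output()`, and the two common fpdf flavors handle output differently (classic PyFPDF returns a latin-1 `str` with `dest="S"`, while fpdf2 returns a `bytearray` and deprecates `dest`). A version-tolerant sketch under those assumptions, not the commit's code:

```python
from io import BytesIO
from fpdf import FPDF

def generate_pdf_bytes(text):
    # Hypothetical variant of generate_pdf that normalizes either fpdf
    # flavor's output into a BytesIO for the download helpers.
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        pdf.multi_cell(0, 10, line)
    raw = pdf.output(dest="S")  # str on PyFPDF, bytearray on fpdf2
    if isinstance(raw, str):
        raw = raw.encode("latin-1")
    return BytesIO(bytes(raw))
```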
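Similarly, the base64 `data:` URI anchor built by `generate_download_button` only renders with `unsafe_allow_html=True`; Streamlit's built-in widget, which the pre-change code already used, accepts bytes or a file-like object directly. A minimal sketch, assuming the `pdf_file` buffer from the main flow:

```python
# No base64 round-trip: st.download_button streams the buffer itself.
st.download_button(
    label="📥 Download Research_Report.pdf",
    data=pdf_file,
    file_name="Research_Report.pdf",
    mime="application/pdf",
)
```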