# AutoReasearcher / app.py
import os
import datetime
from io import BytesIO
from urllib.parse import quote_plus

import streamlit as st
import requests
import feedparser
from dotenv import load_dotenv
from tavily import TavilyClient
from fuzzywuzzy import fuzz
from fpdf import FPDF
# --- Load Keys ---
load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
# Read the Tavily key from the environment as well; a hardcoded fallback key
# would leak a credential in source control.
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
tavily = TavilyClient(api_key=TAVILY_API_KEY)
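# Illustrative .env contents (the variable names match the os.getenv calls
# above; the values are placeholders, not real credentials):
#   OPENROUTER_API_KEY=sk-or-...
#   TAVILY_API_KEY=tvly-...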
# --- Layout ---
st.set_page_config(page_title="Deep Research Bot", layout="wide")

with st.sidebar:
    st.title("🧭 Research Input")
    topic = st.text_input("💡 What would you like me to research next?")
    report_type = st.selectbox("📄 Type of report", [
        "Summary - Short and fast (~2 min)",
        "Detailed Report (~5 min)",
        "Thorough Academic Research (~10 min)"
    ])
    tone = st.selectbox("🎯 Tone of the report", [
        "Objective - Impartial and unbiased presentation of facts and findings",
        "Persuasive - Advocating a specific point of view",
        "Narrative - Storytelling tone for layperson readers"
    ])
    source_type = st.selectbox("🌐 Sources to include", [
        "Web Only", "Academic Only", "Hybrid"
    ])
    custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
st.title("πŸ€– Real-time Deep Research Agent (Tavily Edition)")
st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time using Tavily, ArXiv, and Semantic Scholar.")
# --- Helper Functions ---
def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
    """Send a chat-completion request to OpenRouter and return the reply text."""
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "X-Title": "GPT Deep Research Agent"
    }
    data = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature
    }
    response = requests.post(url, headers=headers, json=data, timeout=120)
    result = response.json()
    if response.status_code != 200:
        raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
    return result["choices"][0]["message"]["content"]
def get_sources(topic, domains=None):
    # Tavily exposes domain filtering directly via include_domains, which is
    # more reliable than splicing "site:" operators into the query string.
    domain_filters = None
    if domains:
        domain_filters = [d.strip() for d in domains.split(",") if d.strip()] or None
    response = tavily.search(query=topic, search_depth="advanced", max_results=10,
                             include_domains=domain_filters)
    sources = []
    for item in response.get("results", []):
        sources.append({
            "title": item.get("title"),
            "url": item.get("url"),
            "snippet": item.get("content", "")
        })
    return sources
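# Trimmed example of the Tavily payload consumed above (illustrative values;
# only the fields this function reads are shown):
#   {"results": [{"title": "...", "url": "https://...", "content": "..."}]}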
def get_arxiv_papers(query):
    url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results=5"
    feed = feedparser.parse(url)
    return [{
        "title": e.title,
        "summary": e.summary.replace("\n", " ").strip(),
        "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
    } for e in feed.entries]
def get_semantic_papers(query):
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {"query": query, "limit": 5, "fields": "title,abstract,url"}
    response = requests.get(url, params=params, timeout=30)
    papers = response.json().get("data", [])
    return [{
        "title": p.get("title"),
        # "abstract" is often present but null, so fall back explicitly
        "summary": p.get("abstract") or "No abstract available",
        "url": p.get("url")
    } for p in papers]
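# The Graph API nests matches under "data", e.g. (abridged):
#   {"data": [{"title": "...", "abstract": "...", "url": "https://www.semanticscholar.org/paper/..."}]}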
def generate_apa_citation(title, url, source):
    year = datetime.datetime.now().year
    label = {
        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
    }.get(source, "*Web*")
    return f"{title}. ({year}). {label}. {url}"
def check_plagiarism(text, topic):
    hits = []
    for r in get_sources(topic):
        similarity = fuzz.token_set_ratio(text, r["snippet"])
        if similarity >= 75:
            hits.append(r)
    return hits
def remove_duplicates(entries):
    unique = []
    titles = []
    for e in entries:
        title = e.get("title") or ""
        # Treat entries whose titles are >= 85% token-set similar as duplicates
        if all(fuzz.token_set_ratio(title, t) < 85 for t in titles):
            titles.append(title)
            unique.append(e)
    return unique
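# For intuition: token_set_ratio ignores word order and repeated tokens, so
#   fuzz.token_set_ratio("Deep Learning for NLP", "NLP for Deep Learning")  # -> 100
# and re-ordered titles of the same paper collapse into a single entry.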
def generate_image_from_topic(topic):
    # Keyword-matched stock photo. Note: source.unsplash.com has been
    # deprecated by Unsplash, so this URL may no longer resolve to an image.
    return f"https://source.unsplash.com/featured/?{quote_plus(topic)}"
def generate_pdf(text):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        # The core Arial/Helvetica font only covers latin-1; replace anything
        # else (emoji, smart quotes) so FPDF does not raise on encoding.
        safe_line = line.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, safe_line)
    # With fpdf2, output() returns the document as a bytearray when no
    # destination is given, which BytesIO wraps for st.download_button.
    return BytesIO(pdf.output())
# --- Execution ---
if st.button("Research"):
    if not topic.strip():
        st.warning("Please enter a topic to research first.")
        st.stop()
    try:
        with st.spinner("🔍 Gathering relevant research..."):
            all_entries = []
            citations = []
            if source_type in ["Web Only", "Hybrid"]:
                web_data = remove_duplicates(get_sources(topic, custom_domains))
                for w in web_data:
                    all_entries.append({
                        "title": w['title'],
                        "summary": w['snippet'],
                        "url": w['url'],
                        "source": "web"
                    })
                    citations.append(generate_apa_citation(w['title'], w['url'], "web"))
            if source_type in ["Academic Only", "Hybrid"]:
                arxiv_data = get_arxiv_papers(topic)
                semantic_data = get_semantic_papers(topic)
                academic_data = remove_duplicates(arxiv_data + semantic_data)
                for a in academic_data:
                    # Classify once so the citation uses the same label
                    # (the original read a['source'] before it was ever set)
                    source = "arxiv" if "arxiv" in a['url'] else "semantic"
                    all_entries.append({
                        "title": a['title'],
                        "summary": a['summary'],
                        "url": a['url'],
                        "source": source
                    })
                    citations.append(generate_apa_citation(a['title'], a['url'], source))
        st.success("✅ Data collected and filtered!")

        with st.spinner("🧠 Writing final research report..."):
            sources_text = ""
            for e in all_entries:
                sources_text += f"- [{e['title']}]({e['url']})\n> {e['summary'][:300]}...\n\n"
            prompt = f"""
# Research Task: {topic}
Tone: {tone}
Report Type: {report_type}
Sources:
{sources_text}
Now, synthesize:
1. Research questions and gap
2. A novel insight or direction
3. A real-world application scenario
4. A {report_type.lower()} in paragraph format (use bullet points only if a paragraph runs too long).
Use larger headings for sections and slightly smaller ones for sub-sections. Do not use markdown or HTML, just plain text.
"""
            output = call_llm([{"role": "user", "content": prompt}], max_tokens=3500)

        st.header("📄 Research Report")
        st.write(output)
        st.subheader("📚 APA Citations")
        for c in citations:
            st.markdown(f"- {c}")

        with st.spinner("🧪 Checking for overlaps..."):
            overlaps = check_plagiarism(output, topic)
        if overlaps:
            st.warning("⚠️ Potential content overlap found.")
            for h in overlaps:
                st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
        else:
            st.success("✅ No major overlaps detected.")

        if report_type.startswith("Thorough"):
            st.subheader("🖼️ Related Visual")
            image_url = generate_image_from_topic(topic)
            st.image(image_url, caption=f"Visual related to: {topic}", use_column_width=True)

        st.subheader("📥 Download Options")
        pdf_file = generate_pdf(output)
        st.download_button("📄 Download PDF", data=pdf_file, file_name=f"{topic}_report.pdf", mime="application/pdf")
        st.download_button("📜 Download LaTeX (raw text)", data=output, file_name=f"{topic}_report.tex", mime="text/plain")
    except Exception as e:
        st.error(f"Error: {e}")