# AutoReasearcher / app.py
import os
import datetime
from io import BytesIO
from urllib.parse import quote_plus

import streamlit as st
import requests
import feedparser
from dotenv import load_dotenv
from tavily import TavilyClient
from fuzzywuzzy import fuzz
from fpdf import FPDF
# --- Load Keys ---
load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
# Read the Tavily key from the environment as well; a hardcoded fallback key
# would leak a credential in source control.
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
tavily = TavilyClient(api_key=TAVILY_API_KEY)
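# Illustrative .env contents (the variable names match the os.getenv calls
# above; the values are placeholders, not real credentials):
#   OPENROUTER_API_KEY=sk-or-...
#   TAVILY_API_KEY=tvly-...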
# --- Layout ---
st.set_page_config(page_title="Deep Research Bot", layout="wide")

with st.sidebar:
    st.title("🧭 Research Input")
    topic = st.text_input("💡 What would you like me to research next?")
    report_type = st.selectbox("📄 Type of report", [
        "Summary - Short and fast (~2 min)",
        "Detailed Report (~5 min)",
        "Thorough Academic Research (~10 min)"
    ])
    tone = st.selectbox("🎯 Tone of the report", [
        "Objective - Impartial and unbiased presentation of facts and findings",
        "Persuasive - Advocating a specific point of view",
        "Narrative - Storytelling tone for layperson readers"
    ])
    source_type = st.selectbox("🌐 Sources to include", [
        "Web Only", "Academic Only", "Hybrid"
    ])
    custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
st.title("πŸ€– Real-time Deep Research Agent (Tavily Edition)")
st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time using Tavily, ArXiv, and Semantic Scholar.")
# --- Helper Functions ---
def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
    """Send a chat-completion request to OpenRouter and return the reply text."""
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "X-Title": "GPT Deep Research Agent"
    }
    data = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature
    }
    response = requests.post(url, headers=headers, json=data, timeout=120)
    result = response.json()
    if response.status_code != 200:
        raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
    return result["choices"][0]["message"]["content"]
def get_sources(topic, domains=None):
    # Tavily exposes domain filtering directly via include_domains, which is
    # more reliable than splicing "site:" operators into the query string.
    domain_filters = None
    if domains:
        domain_filters = [d.strip() for d in domains.split(",") if d.strip()] or None
    response = tavily.search(query=topic, search_depth="advanced", max_results=10,
                             include_domains=domain_filters)
    sources = []
    for item in response.get("results", []):
        sources.append({
            "title": item.get("title"),
            "url": item.get("url"),
            "snippet": item.get("content", "")
        })
    return sources
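# Trimmed example of the Tavily payload consumed above (illustrative values;
# only the fields this function reads are shown):
#   {"results": [{"title": "...", "url": "https://...", "content": "..."}]}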
def get_arxiv_papers(query):
    url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results=5"
    feed = feedparser.parse(url)
    return [{
        "title": e.title,
        "summary": e.summary.replace("\n", " ").strip(),
        "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
    } for e in feed.entries]
def get_semantic_papers(query):
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {"query": query, "limit": 5, "fields": "title,abstract,url"}
    response = requests.get(url, params=params, timeout=30)
    papers = response.json().get("data", [])
    return [{
        "title": p.get("title"),
        # "abstract" is often present but null, so fall back explicitly
        "summary": p.get("abstract") or "No abstract available",
        "url": p.get("url")
    } for p in papers]
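# The Graph API nests matches under "data", e.g. (abridged):
#   {"data": [{"title": "...", "abstract": "...", "url": "https://www.semanticscholar.org/paper/..."}]}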
def generate_apa_citation(title, url, source):
    year = datetime.datetime.now().year
    label = {
        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
    }.get(source, "*Web*")
    return f"{title}. ({year}). {label}. {url}"
def check_plagiarism(text, topic):
    hits = []
    for r in get_sources(topic):
        similarity = fuzz.token_set_ratio(text, r["snippet"])
        if similarity >= 75:
            hits.append(r)
    return hits
def remove_duplicates(entries):
    unique = []
    titles = []
    for e in entries:
        title = e.get("title") or ""
        # Treat entries whose titles are >= 85% token-set similar as duplicates
        if all(fuzz.token_set_ratio(title, t) < 85 for t in titles):
            titles.append(title)
            unique.append(e)
    return unique
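# For intuition: token_set_ratio ignores word order and repeated tokens, so
#   fuzz.token_set_ratio("Deep Learning for NLP", "NLP for Deep Learning")  # -> 100
# and re-ordered titles of the same paper collapse into a single entry.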
def generate_image_from_topic(topic):
    # Keyword-matched stock photo. Note: source.unsplash.com has been
    # deprecated by Unsplash, so this URL may no longer resolve to an image.
    return f"https://source.unsplash.com/featured/?{quote_plus(topic)}"
def generate_pdf(text):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        # The core Arial/Helvetica font only covers latin-1; replace anything
        # else (emoji, smart quotes) so FPDF does not raise on encoding.
        safe_line = line.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, safe_line)
    # With fpdf2, output() returns the document as a bytearray when no
    # destination is given, which BytesIO wraps for st.download_button.
    return BytesIO(pdf.output())
# --- Execution ---
if st.button("Research"):
    if not topic.strip():
        st.warning("Please enter a topic to research first.")
        st.stop()
    try:
        with st.spinner("🔍 Gathering relevant research..."):
            all_entries = []
            citations = []
            if source_type in ["Web Only", "Hybrid"]:
                web_data = remove_duplicates(get_sources(topic, custom_domains))
                for w in web_data:
                    all_entries.append({
                        "title": w['title'],
                        "summary": w['snippet'],
                        "url": w['url'],
                        "source": "web"
                    })
                    citations.append(generate_apa_citation(w['title'], w['url'], "web"))
            if source_type in ["Academic Only", "Hybrid"]:
                arxiv_data = get_arxiv_papers(topic)
                semantic_data = get_semantic_papers(topic)
                academic_data = remove_duplicates(arxiv_data + semantic_data)
                for a in academic_data:
                    # Classify once so the citation uses the same label
                    # (the original read a['source'] before it was ever set)
                    source = "arxiv" if "arxiv" in a['url'] else "semantic"
                    all_entries.append({
                        "title": a['title'],
                        "summary": a['summary'],
                        "url": a['url'],
                        "source": source
                    })
                    citations.append(generate_apa_citation(a['title'], a['url'], source))
        st.success("✅ Data collected and filtered!")

        with st.spinner("🧠 Writing final research report..."):
            sources_text = ""
            for e in all_entries:
                sources_text += f"- [{e['title']}]({e['url']})\n> {e['summary'][:300]}...\n\n"
            prompt = f"""
# Research Task: {topic}
Tone: {tone}
Report Type: {report_type}
Sources:
{sources_text}
Now, synthesize:
1. Research questions and gap
2. A novel insight or direction
3. A real-world application scenario
4. A {report_type.lower()} in paragraph format (use bullet points only if a paragraph runs too long).
Use larger headings for sections and slightly smaller ones for sub-sections. Do not use markdown or HTML, just plain text.
"""
            output = call_llm([{"role": "user", "content": prompt}], max_tokens=3500)

        st.header("📄 Research Report")
        st.write(output)
        st.subheader("📚 APA Citations")
        for c in citations:
            st.markdown(f"- {c}")

        with st.spinner("🧪 Checking for overlaps..."):
            overlaps = check_plagiarism(output, topic)
        if overlaps:
            st.warning("⚠️ Potential content overlap found.")
            for h in overlaps:
                st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
        else:
            st.success("✅ No major overlaps detected.")

        if report_type.startswith("Thorough"):
            st.subheader("🖼️ Related Visual")
            image_url = generate_image_from_topic(topic)
            st.image(image_url, caption=f"Visual related to: {topic}", use_column_width=True)

        st.subheader("📥 Download Options")
        pdf_file = generate_pdf(output)
        st.download_button("📄 Download PDF", data=pdf_file, file_name=f"{topic}_report.pdf", mime="application/pdf")
        st.download_button("📜 Download LaTeX (raw text)", data=output, file_name=f"{topic}_report.tex", mime="text/plain")
    except Exception as e:
        st.error(f"Error: {e}")