Ani14 committed
Commit bd2e62c · verified · 1 Parent(s): 27f01b8

Update app.py

Files changed (1)
  1. app.py +103 -182
app.py CHANGED
@@ -2,26 +2,56 @@ import os
  import streamlit as st
  import requests
  import datetime
- import feedparser
- import time
  from dotenv import load_dotenv
  from tavily import TavilyClient
  from fuzzywuzzy import fuzz
- from urllib.parse import quote_plus
- from PIL import Image
- from io import BytesIO
  from fpdf import FPDF

- # --- Load Keys ---
  load_dotenv()
  OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "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")
  tavily = TavilyClient(api_key=TAVILY_API_KEY)

- # --- Layout ---
  st.set_page_config("Deep Research Bot", layout="wide")
  with st.sidebar:
-     st.title("🧭 Research Input")
      topic = st.text_input("💡 What would you like me to research next?")
      report_type = st.selectbox("📄 Type of report", [
          "Summary - Short and fast (~2 min)",
@@ -37,186 +67,77 @@ with st.sidebar:
          "Web Only", "Academic Only", "Hybrid"
      ])
      custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")

  st.title("🤖 Real-time Deep Research Agent (Tavily Edition)")
- st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time using Tavily, ArXiv, and Semantic Scholar.")

- # --- Helper Functions ---
- def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
-     url = "https://openrouter.ai/api/v1/chat/completions"
-     headers = {
-         "Authorization": f"Bearer {OPENROUTER_API_KEY}",
-         "Content-Type": "application/json",
-         "X-Title": "GPT Deep Research Agent"
-     }
-     data = {
-         "model": model,
-         "messages": messages,
-         "max_tokens": max_tokens,
-         "temperature": temperature
-     }
-     response = requests.post(url, headers=headers, json=data)
-     result = response.json()
-     if response.status_code != 200:
-         raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
-     return result["choices"][0]["message"]["content"]
-
- def get_sources(topic, domains=None):
-     query = topic
-     if domains:
-         domain_filters = [d.strip() for d in domains.split(",") if d.strip()]
-         query += " site:" + " OR site:".join(domain_filters)
-
-     response = tavily.search(query=query, search_depth="advanced", max_results=10)
-     sources = []
-     for item in response.get("results", []):
-         sources.append({
-             "title": item.get("title"),
-             "url": item.get("url"),
-             "snippet": item.get("content", "")
-         })
-     return sources
-
- def get_arxiv_papers(query):
-     url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results=5"
-     feed = feedparser.parse(url)
-     return [{
-         "title": e.title,
-         "summary": e.summary.replace("\n", " ").strip(),
-         "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
-     } for e in feed.entries]
-
- def get_semantic_papers(query):
-     url = "https://api.semanticscholar.org/graph/v1/paper/search"
-     params = {"query": query, "limit": 5, "fields": "title,abstract,url"}
-     response = requests.get(url, params=params)
-     papers = response.json().get("data", [])
-     return [{
-         "title": p.get("title"),
-         "summary": p.get("abstract", "No abstract available"),
-         "url": p.get("url")
-     } for p in papers]
-
- def generate_apa_citation(title, url, source):
-     year = datetime.datetime.now().year
-     label = {
-         "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
-     }.get(source, "*Web*")
-     return f"{title}. ({year}). {label}. {url}"
-
- def check_plagiarism(text, topic):
-     hits = []
-     for r in get_sources(topic, ""):
-         similarity = fuzz.token_set_ratio(text, r["snippet"])
-         if similarity >= 75:
-             hits.append(r)
-     return hits
-
- def remove_duplicates(entries):
-     unique = []
-     titles = []
-     for e in entries:
-         if all(fuzz.token_set_ratio(e["title"], t) < 85 for t in titles):
-             titles.append(e["title"])
-             unique.append(e)
-     return unique
-
- def generate_image_from_topic(topic):
-     img_prompt = f"Illustration representing '{topic}' in a research or technology context."
-     image_url = f"https://source.unsplash.com/featured/?{quote_plus(topic)}"
-     return image_url
-
- def generate_pdf(text):
-     pdf = FPDF()
-     pdf.add_page()
-     pdf.set_auto_page_break(auto=True, margin=15)
-     pdf.set_font("Arial", size=12)
-     for line in text.split("\n"):
-         pdf.multi_cell(0, 10, line)
-     buffer = BytesIO()
-     pdf.output(buffer)
-     buffer.seek(0)
-     return buffer
-
- # --- Execution ---
- if st.button("Research"):
      try:
-         with st.spinner("🔍 Gathering relevant research..."):
-             all_entries = []
-             citations = []

          if source_type in ["Web Only", "Hybrid"]:
-             web_data = get_sources(topic, custom_domains)
-             web_data = remove_duplicates(web_data)
-             for w in web_data:
-                 all_entries.append({
-                     "title": w['title'],
-                     "summary": w['snippet'],
-                     "url": w['url'],
-                     "source": "web"
-                 })
-                 citations.append(generate_apa_citation(w['title'], w['url'], "web"))
-
          if source_type in ["Academic Only", "Hybrid"]:
-             arxiv_data = get_arxiv_papers(topic)
-             semantic_data = get_semantic_papers(topic)
-             academic_data = remove_duplicates(arxiv_data + semantic_data)
-             for a in academic_data:
-                 all_entries.append({
-                     "title": a['title'],
-                     "summary": a['summary'],
-                     "url": a['url'],
-                     "source": "arxiv" if "arxiv" in a['url'] else "semantic"
-                 })
-                 citations.append(generate_apa_citation(a['title'], a['url'], a['source']))
-
-         st.success("✅ Data collected and filtered!")
-
-         with st.spinner("🧠 Writing final research report..."):
-             sources_text = ""
-             for e in all_entries:
-                 sources_text += f"- [{e['title']}]({e['url']})\n> {e['summary'][:300]}...\n\n"
-
-             prompt = f"""
-             # Research Task: {topic}
-             Tone: {tone}
-             Report Type: {report_type}
-             Sources:
-             {sources_text}
-             Now, synthesize:
-             1. Research questions and gap
-             2. A novel insight or direction
-             3. A real-world application scenario
-             4. A {report_type.lower()} in paragraph format (use bullet points only if the paragraph is too long).
-             Use larger heading for sections and slightly smaller for sub-sections. Do not use markdown or HTML, just plain text.
-             """
-             output = call_llm([{"role": "user", "content": prompt}], max_tokens=3500)
-
-         st.header("📄 Research Report")
-         st.write(output)
-
-         st.subheader("📚 APA Citations")
-         for c in citations:
-             st.markdown(f"- {c}")
-
-         with st.spinner("🧪 Checking for overlaps..."):
-             overlaps = check_plagiarism(output, topic)
-             if overlaps:
-                 st.warning("⚠️ Potential content overlap found.")
-                 for h in overlaps:
-                     st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
-             else:
-                 st.success("✅ No major overlaps detected.")
-
-         if report_type.startswith("Thorough"):
-             st.subheader("🖼️ Related Visual")
-             image_url = generate_image_from_topic(topic)
-             st.image(image_url, caption=f"Visual related to: {topic}", use_column_width=True)
-
-         st.subheader("📥 Download Options")
-         pdf_file = generate_pdf(output)
-         st.download_button("📄 Download PDF", data=pdf_file, file_name=f"{topic}_report.pdf", mime="application/pdf")
-         st.download_button("📜 Download LaTeX (raw text)", data=output, file_name=f"{topic}_report.tex", mime="text/plain")

      except Exception as e:
-         st.error(f"Error: {e}")
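Note: the updated file (additions below) still calls call_llm, generate_apa_citation, get_arxiv_papers, get_semantic_papers, and check_plagiarism, even though this commit deletes their definitions; as committed, clicking Start Research would raise a NameError unless they are restored. A minimal sketch of call_llm along the lines of the removed implementation (the explicit timeout is an assumption, not in the original):

import requests

def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
    # Sketch reconstructed from the removed implementation above.
    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",  # assumes the key loaded via dotenv above
            "Content-Type": "application/json",
            "X-Title": "GPT Deep Research Agent",
        },
        json={"model": model, "messages": messages,
              "max_tokens": max_tokens, "temperature": temperature},
        timeout=60,  # assumption: the removed code sent the request without a timeout
    )
    result = response.json()
    if response.status_code != 200:
        raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
    return result["choices"][0]["message"]["content"]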
 
  import streamlit as st
  import requests
  import datetime
  from dotenv import load_dotenv
  from tavily import TavilyClient
+ import feedparser
+ import time
  from fuzzywuzzy import fuzz
  from fpdf import FPDF

+ # Load environment variables
  load_dotenv()
  OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
  TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "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")
  tavily = TavilyClient(api_key=TAVILY_API_KEY)

+ # --- Helper Functions ---
+ def get_sources(topic, domains=None):
+     query = topic
+     if domains:
+         domain_filters = [d.strip() for d in domains.split(",") if d.strip()]
+         query += " site:" + " OR site:".join(domain_filters)
+
+     response = tavily.search(query=query, search_depth="advanced", max_results=10)
+     sources = []
+     for item in response.get("results", []):
+         sources.append({
+             "title": item.get("title"),
+             "url": item.get("url"),
+             "snippet": item.get("content", "")
+         })
+     return sources
+
+ def merge_duplicates(data):
+     seen_titles = {}
+     unique_data = []
+     for item in data:
+         title = item['title']
+         is_duplicate = False
+         for seen_title in seen_titles:
+             if fuzz.ratio(title.lower(), seen_title.lower()) > 85:
+                 is_duplicate = True
+                 break
+         if not is_duplicate:
+             seen_titles[title] = True
+             unique_data.append(item)
+     return unique_data
+
+ # --- Streamlit UI ---
  st.set_page_config("Deep Research Bot", layout="wide")
+
  with st.sidebar:
+     st.header("🧪 Research Configuration")
      topic = st.text_input("💡 What would you like me to research next?")
      report_type = st.selectbox("📄 Type of report", [
          "Summary - Short and fast (~2 min)",

          "Web Only", "Academic Only", "Hybrid"
      ])
      custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
+     research_triggered = st.button("🔎 Start Research")

  st.title("🤖 Real-time Deep Research Agent (Tavily Edition)")
+ st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time using Tavily.")

+ if research_triggered:
      try:
+         with st.status("Starting agent tasks..."):
+             st.info("🧠 Thinking through research questions...")
+             time.sleep(1)
+             st.info("🌐 Fetching data from selected sources...")

+         raw_data = []
+         citations = []
          if source_type in ["Web Only", "Hybrid"]:
+             web = get_sources(topic, custom_domains)
+             raw_data.extend(web)
+             for w in web:
+                 citations.append(generate_apa_citation(w["title"], w["url"], "web"))
          if source_type in ["Academic Only", "Hybrid"]:
+             arxiv = get_arxiv_papers(topic)
+             scholar = get_semantic_papers(topic)
+             raw_data.extend(arxiv)
+             raw_data.extend(scholar)
+             for p in arxiv:
+                 citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
+             for s in scholar:
+                 citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))
+
+         # Merge duplicates before formatting
+         filtered_data = merge_duplicates(raw_data)
+         all_data = ""
+         for item in filtered_data:
+             summary = item.get("snippet") or item.get("summary", "")
+             all_data += f"- [{item['title']}]({item['url']})\n> {summary[:300]}...\n\n"
+
+         st.success("Data collection complete!")
+
+         with st.spinner("📝 Writing final research report..."):
+             prompt = generate_prompt(report_type, tone, topic, all_data)
+             output = call_llm([{"role": "user", "content": prompt}], max_tokens=4000)
+
+         st.markdown("## 📄 Research Report")
+         st.markdown(f"<div style='font-size:16px;'>{output}</div>", unsafe_allow_html=True)
+
+         if "Detailed" in report_type or "Thorough" in report_type:
+             st.markdown("## 📚 APA Citations")
+             for c in citations:
+                 st.markdown(f"- {c}")
+
+         if "Thorough" in report_type:
+             image_links = get_image_links(topic)
+             if image_links:
+                 st.markdown("## 🖼️ Related Visuals")
+                 for img in image_links:
+                     st.image(img, use_column_width=True)
+
+         with st.spinner("🧪 Checking for overlaps..."):
+             overlaps = check_plagiarism(output, topic)
+             if overlaps:
+                 st.warning("⚠️ Potential content overlap found.")
+                 for h in overlaps:
+                     st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
+             else:
+                 st.success("✅ No major overlaps detected.")
+
+         pdf_path = save_pdf(output)
+         with open(pdf_path, "rb") as pdf_file:
+             st.download_button("📄 Download PDF", pdf_file, file_name="research_report.pdf")
+
+         st.download_button("📄 Download LaTeX", output.encode("utf-8"), file_name="research_report.tex")

      except Exception as e:
+         st.error(f"Error: {e}")
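generate_prompt and save_pdf are likewise referenced above but defined in neither version of the file. Hypothetical sketches, with names and signatures inferred from the call sites and bodies modeled on the removed inline prompt and the removed generate_pdf (not part of the commit):

from fpdf import FPDF

def generate_prompt(report_type, tone, topic, sources_text):
    # Modeled on the f-string prompt the commit removed.
    return (
        f"# Research Task: {topic}\n"
        f"Tone: {tone}\n"
        f"Report Type: {report_type}\n"
        f"Sources:\n{sources_text}\n"
        "Now, synthesize:\n"
        "1. Research questions and gap\n"
        "2. A novel insight or direction\n"
        "3. A real-world application scenario\n"
        f"4. A {report_type.lower()} in paragraph format.\n"
    )

def save_pdf(text, path="research_report.pdf"):
    # The call site reopens the returned value with open(pdf_path, "rb"),
    # so this writes to disk instead of returning a BytesIO buffer
    # the way the removed generate_pdf did.
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        pdf.multi_cell(0, 10, line)
    pdf.output(path)
    return path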
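get_image_links is also undefined after this commit. One hedged possibility is Tavily's include_images search option (an assumption; the commit does not show how images are meant to be fetched):

def get_image_links(topic, max_images=3):
    # Assumes the module-level tavily client defined above; include_images
    # asks Tavily to return an "images" list of URLs alongside text results.
    response = tavily.search(query=topic, include_images=True)
    return response.get("images", [])[:max_images]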