Ani14 committed
Commit 05644a0 · verified · 1 Parent(s): bd2e62c

Update app.py

Files changed (1)
  1. app.py +159 -86
app.py CHANGED
@@ -7,21 +7,42 @@ from tavily import TavilyClient
 import feedparser
 import time
 from fuzzywuzzy import fuzz
+from PIL import Image
+from io import BytesIO
 from fpdf import FPDF
+import base64

 # Load environment variables
 load_dotenv()
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
-TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "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")
+TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 tavily = TavilyClient(api_key=TAVILY_API_KEY)

 # --- Helper Functions ---
+def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=3500, temperature=0.7):
+    url = "https://openrouter.ai/api/v1/chat/completions"
+    headers = {
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+        "Content-Type": "application/json",
+        "X-Title": "GPT Deep Research Agent"
+    }
+    data = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "temperature": temperature
+    }
+    response = requests.post(url, headers=headers, json=data)
+    result = response.json()
+    if response.status_code != 200:
+        raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
+    return result["choices"][0]["message"]["content"]
+
 def get_sources(topic, domains=None):
     query = topic
     if domains:
         domain_filters = [d.strip() for d in domains.split(",") if d.strip()]
         query += " site:" + " OR site:".join(domain_filters)
-
     response = tavily.search(query=query, search_depth="advanced", max_results=10)
     sources = []
     for item in response.get("results", []):
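The hunk above does two things worth noting: it drops a hardcoded Tavily API key in favor of a plain environment lookup, and it introduces a `call_llm` helper around OpenRouter's chat-completions endpoint that raises on non-200 responses. A minimal usage sketch, assuming `OPENROUTER_API_KEY` is exported and the pinned free DeepSeek route is still served; the prompt is purely illustrative:

```python
# Illustrative call to the call_llm helper added in this hunk.
# Assumes OPENROUTER_API_KEY is set in the environment; the default model
# slug ("deepseek/deepseek-chat-v3-0324:free") comes from the diff and may
# be renamed or withdrawn on OpenRouter's side.
reply = call_llm(
    messages=[{"role": "user", "content": "List three uses of retrieval-augmented generation."}],
    max_tokens=256,
    temperature=0.3,
)
print(reply)
```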
@@ -32,27 +53,84 @@ def get_sources(topic, domains=None):
         })
     return sources

-def merge_duplicates(data):
-    seen_titles = {}
-    unique_data = []
-    for item in data:
-        title = item['title']
-        is_duplicate = False
-        for seen_title in seen_titles:
-            if fuzz.ratio(title.lower(), seen_title.lower()) > 85:
-                is_duplicate = True
-                break
-        if not is_duplicate:
-            seen_titles[title] = True
-            unique_data.append(item)
-    return unique_data
+def get_arxiv_papers(query):
+    from urllib.parse import quote_plus
+    url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results=5"
+    feed = feedparser.parse(url)
+    return [{
+        "title": e.title,
+        "summary": e.summary.replace("\n", " ").strip(),
+        "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
+    } for e in feed.entries]
+
+def get_semantic_papers(query):
+    url = "https://api.semanticscholar.org/graph/v1/paper/search"
+    params = {"query": query, "limit": 5, "fields": "title,abstract,url"}
+    response = requests.get(url, params=params)
+    papers = response.json().get("data", [])
+    return [{
+        "title": p.get("title"),
+        "summary": p.get("abstract", "No abstract available"),
+        "url": p.get("url")
+    } for p in papers]
+
+def check_plagiarism(text, topic):
+    hits = []
+    for r in get_sources(topic):
+        similarity = fuzz.token_set_ratio(text, r["snippet"])
+        if similarity >= 75:
+            hits.append(r)
+    return hits
+
+def generate_apa_citation(title, url, source):
+    year = datetime.datetime.now().year
+    label = {
+        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
+    }.get(source, "*Web*")
+    return f"{title}. ({year}). {label}. {url}"
+
+def merge_duplicates(entries):
+    unique = []
+    seen_titles = []
+    for entry in entries:
+        if all(fuzz.token_set_ratio(entry['title'], seen) < 90 for seen in seen_titles):
+            unique.append(entry)
+            seen_titles.append(entry['title'])
+    return unique
+
+def generate_pdf(text):
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_auto_page_break(auto=True, margin=15)
+    pdf.set_font("Arial", size=12)
+    for line in text.split('\n'):
+        pdf.multi_cell(0, 10, line)
+    pdf_output = BytesIO()
+    pdf.output(pdf_output)
+    pdf_output.seek(0)
+    return pdf_output
+
+def generate_latex(text):
+    latex = "\\documentclass{article}\n\\usepackage{hyperref}\n\\begin{document}\n"
+    for line in text.split('\n'):
+        latex += line.replace('_', '\\_') + "\\\\\n"
+    latex += "\\end{document}"
+    return BytesIO(latex.encode("utf-8"))
+
+def generate_download_button(file, label, mime_type):
+    b64 = base64.b64encode(file.read()).decode()
+    return f"""
+    <a href="data:{mime_type};base64,{b64}" download="{label}">
+        📥 Download {label}
+    </a>
+    """

 # --- Streamlit UI ---
 st.set_page_config("Deep Research Bot", layout="wide")

 with st.sidebar:
-    st.header("🧪 Research Configuration")
-    topic = st.text_input("💡 What would you like me to research next?")
+    st.title("🧠 Deep Research Assistant")
+    topic = st.text_input("💡 Topic to research")
     report_type = st.selectbox("📄 Type of report", [
         "Summary - Short and fast (~2 min)",
         "Detailed Report (~5 min)",
@@ -63,81 +141,76 @@ with st.sidebar:
         "Persuasive - Advocating a specific point of view",
         "Narrative - Storytelling tone for layperson readers"
     ])
-    source_type = st.selectbox("🌐 Sources to include", [
-        "Web Only", "Academic Only", "Hybrid"
-    ])
+    source_type = st.selectbox("🌐 Sources to include", ["Web Only", "Academic Only", "Hybrid"])
     custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
-    research_triggered = st.button("🔎 Start Research")
+    research_button = st.button("Research")

-st.title("🤖 Real-time Deep Research Agent (Tavily Edition)")
-st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time using Tavily.")
+st.title("📑 Research Output")

-if research_triggered:
+if research_button and topic:
     try:
-        with st.status("Starting agent tasks..."):
-            st.info("🧠 Thinking through research questions...")
-            time.sleep(1)
-            st.info("🌐 Fetching data from selected sources...")
+        with st.status("🔍 Gathering data..."):
+            st.info("Fetching from sources...")

-        raw_data = []
+        all_sources = []
         citations = []
+
         if source_type in ["Web Only", "Hybrid"]:
-            web = get_sources(topic, custom_domains)
-            raw_data.extend(web)
-            for w in web:
-                citations.append(generate_apa_citation(w["title"], w["url"], "web"))
+            web_data = get_sources(topic, custom_domains)
+            for item in web_data:
+                all_sources.append(item | {"source": "web"})
+
         if source_type in ["Academic Only", "Hybrid"]:
-            arxiv = get_arxiv_papers(topic)
-            scholar = get_semantic_papers(topic)
-            raw_data.extend(arxiv)
-            raw_data.extend(scholar)
-            for p in arxiv:
-                citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
-            for s in scholar:
-                citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))
-
-        # Merge duplicates before formatting
-        filtered_data = merge_duplicates(raw_data)
-        all_data = ""
-        for item in filtered_data:
-            summary = item.get("snippet") or item.get("summary", "")
-            all_data += f"- [{item['title']}]({item['url']})\n> {summary[:300]}...\n\n"
-
-        st.success("Data collection complete!")
-
-        with st.spinner("📝 Writing final research report..."):
-            prompt = generate_prompt(report_type, tone, topic, all_data)
-            output = call_llm([{"role": "user", "content": prompt}], max_tokens=4000)
-
-        st.markdown("## 📄 Research Report")
-        st.markdown(f"<div style='font-size:16px;'>{output}</div>", unsafe_allow_html=True)
-
-        if "Detailed" in report_type or "Thorough" in report_type:
-            st.markdown("## 📚 APA Citations")
-            for c in citations:
-                st.markdown(f"- {c}")
-
-        if "Thorough" in report_type:
-            image_links = get_image_links(topic)
-            if image_links:
-                st.markdown("## 🖼️ Related Visuals")
-                for img in image_links:
-                    st.image(img, use_column_width=True)
-
-        with st.spinner("🧪 Checking for overlaps..."):
-            overlaps = check_plagiarism(output, topic)
-            if overlaps:
-                st.warning("⚠️ Potential content overlap found.")
-                for h in overlaps:
-                    st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
-            else:
-                st.success("✅ No major overlaps detected.")
-
-        pdf_path = save_pdf(output)
-        with open(pdf_path, "rb") as pdf_file:
-            st.download_button("📄 Download PDF", pdf_file, file_name="research_report.pdf")
-
-        st.download_button("📄 Download LaTeX", output.encode("utf-8"), file_name="research_report.tex")
+            arxiv_data = get_arxiv_papers(topic)
+            for item in arxiv_data:
+                all_sources.append(item | {"source": "arxiv"})
+            semantic_data = get_semantic_papers(topic)
+            for item in semantic_data:
+                all_sources.append(item | {"source": "semantic"})
+
+        merged = merge_duplicates(all_sources)
+        combined_text = ""
+        for m in merged:
+            combined_text += f"- [{m['title']}]({m['url']})\n> {m.get('snippet', m.get('summary', ''))[:300]}...\n\n"
+            citations.append(generate_apa_citation(m['title'], m['url'], m['source']))
+
+        with st.spinner("✍️ Synthesizing report..."):
+            prompt = f"""
+            # Research Topic: {topic}
+            Tone: {tone}
+            Type: {report_type}
+            Sources:
+            {combined_text}
+            Write the report in academic markdown with paragraphs (use bullet points only when necessary). Include:
+            1. Introduction
+            2. Research Gap
+            3. Novel Insight
+            4. Application
+            5. Full Academic Writeup if Thorough Report
+            """
+            final_output = call_llm([{"role": "user", "content": prompt}])
+
+        st.markdown(f"### 📄 {report_type}")
+        st.markdown(final_output, unsafe_allow_html=True)
+
+        st.markdown("### 📚 Citations (APA Format)")
+        for cite in citations:
+            st.markdown(f"- {cite}")
+
+        if report_type == "Thorough Academic Research (~10 min)":
+            with st.spinner("📦 Preparing PDF and LaTeX..."):
+                pdf_file = generate_pdf(final_output)
+                latex_file = generate_latex(final_output)
+                st.markdown(generate_download_button(pdf_file, "Research_Report.pdf", "application/pdf"), unsafe_allow_html=True)
+                st.markdown(generate_download_button(latex_file, "Research_Report.tex", "application/x-latex"), unsafe_allow_html=True)
+
+        overlaps = check_plagiarism(final_output, topic)
+        if overlaps:
+            st.warning("⚠️ Potential overlaps detected:")
+            for hit in overlaps:
+                st.markdown(f"- [{hit['title']}]({hit['url']})")
+        else:
+            st.success("✅ No major overlaps found.")

     except Exception as e:
-        st.error(f"Error: {e}")
+        st.error(f"Error: {e}")
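One portability note on the collection loop in the final hunk: `item | {"source": "web"}` relies on the dict-merge operator from PEP 584, which needs Python 3.9 or newer. On older interpreters, an unpacking merge is equivalent:

```python
# Equivalent to item | {"source": "web"} without the PEP 584 operator.
all_sources.append({**item, "source": "web"})
```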
 
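A caveat for anyone reusing `generate_pdf` from this commit: `pdf.output(pdf_output)` hands a `BytesIO` to fpdf's `output()`, and the two common fpdf flavors handle output differently (classic PyFPDF returns a latin-1 `str` with `dest="S"`, while fpdf2 returns a `bytearray` and deprecates `dest`). A version-tolerant sketch under those assumptions, not the commit's code:

```python
from io import BytesIO
from fpdf import FPDF

def generate_pdf_bytes(text):
    # Hypothetical variant of generate_pdf that normalizes either fpdf
    # flavor's output into a BytesIO for the download helpers.
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        pdf.multi_cell(0, 10, line)
    raw = pdf.output(dest="S")  # str on PyFPDF, bytearray on fpdf2
    if isinstance(raw, str):
        raw = raw.encode("latin-1")
    return BytesIO(bytes(raw))
```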
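Similarly, the base64 `data:` URI anchor built by `generate_download_button` only renders with `unsafe_allow_html=True`; Streamlit's built-in widget, which the pre-change code already used, accepts bytes or a file-like object directly. A minimal sketch, assuming the `pdf_file` buffer from the main flow:

```python
# No base64 round-trip: st.download_button streams the buffer itself.
st.download_button(
    label="📥 Download Research_Report.pdf",
    data=pdf_file,
    file_name="Research_Report.pdf",
    mime="application/pdf",
)
```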