QuantumLearner committed on
Commit
c443f96
·
verified ·
1 Parent(s): 0feb25a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -157
app.py CHANGED
@@ -1,4 +1,9 @@
1
- import os
 
 
 
 
 
2
  import io
3
  import uuid
4
  import asyncio
@@ -10,68 +15,13 @@ import nest_asyncio
10
  from fpdf import FPDF
11
  from gpt_researcher import GPTResearcher
12
 
13
-
14
- # -------------------------
15
- # Page & global configuration
16
- # -------------------------
17
  st.set_page_config(layout="wide", page_title="GPT Researcher")
18
 
19
- # Base providers & defaults
20
- os.environ.setdefault("LLM_PROVIDER", "openai")
21
- os.environ.setdefault("EMBEDDING_PROVIDER", "openai")
22
- os.environ.setdefault("EMBEDDING_MODEL", "text-embedding-3-small")
23
-
24
- # IMPORTANT: gpt_researcher expects "<provider>:<model>" for SMART_LLM / FAST_LLM / STRATEGIC_LLM
25
- _provider = os.environ.get("LLM_PROVIDER", "openai")
26
- _default_strategic = "gpt-4o"
27
- _default_smart = "gpt-4o-mini"
28
-
29
- # Seed all the variants some releases look for
30
- def _seed_llm_env(strategic_model: str, smart_model: str, provider: str = _provider):
31
- strategic = f"{provider}:{strategic_model}"
32
- smart = f"{provider}:{smart_model}"
33
- # Required (newer versions check these):
34
- os.environ["STRATEGIC_LLM"] = strategic
35
- os.environ["SMART_LLM"] = smart
36
- os.environ["FAST_LLM"] = smart # alias some builds use
37
-
38
- # Back-compat aliases some releases read:
39
- os.environ["STRATEGY_LLM"] = strategic
40
- os.environ["STRATEGIC_MODEL"] = strategic_model
41
- os.environ["SMART_MODEL"] = smart_model
42
-
43
- # Embeddings (some builds accept both split and combined)
44
- os.environ["EMBEDDING"] = f"{os.environ.get('EMBEDDING_PROVIDER','openai')}:{os.environ.get('EMBEDDING_MODEL','text-embedding-3-small')}"
45
-
46
- _seed_llm_env(_default_strategic, _default_smart)
47
-
48
- # Allow asyncio.run inside Streamlit
49
  nest_asyncio.apply()
50
 
51
-
52
- # -------------------------
53
- # Small helpers
54
- # -------------------------
55
- def _apply_model_env(strategic_model: str, smart_model: str):
56
- """Apply model choices in the provider-qualified format required by gpt_researcher."""
57
- _seed_llm_env(strategic_model, smart_model, provider=os.environ.get("LLM_PROVIDER", "openai"))
58
-
59
- def _clean_logs(text: str) -> str:
60
- """Optionally hide noisy lines about unavailable models, keep everything else."""
61
- if not text:
62
- return text
63
- bad_bits = [
64
- "The model `o1-preview` does not exist",
65
- "`o1-preview` does not exist",
66
- "model_not_found",
67
- ]
68
- lines = []
69
- for line in text.splitlines():
70
- if any(b in line for b in bad_bits):
71
- continue
72
- lines.append(line)
73
- return "\n".join(lines)
74
-
75
  class PDF(FPDF):
76
  def header(self):
77
  self.set_font("Arial", "B", 12)
@@ -83,53 +33,58 @@ class PDF(FPDF):
83
  self.cell(0, 10, f"Page {self.page_no()}", 0, 0, "C")
84
 
85
  def create_pdf(report_text: str) -> str:
86
- """Write PDF to a unique, writable temp path and return the path."""
87
  pdf_path = f"/tmp/research_report_{uuid.uuid4().hex}.pdf"
88
  pdf = PDF()
89
  pdf.add_page()
90
  pdf.set_auto_page_break(auto=True, margin=15)
91
  pdf.set_font("Arial", size=12)
92
- # FPDF is Latin-1; degrade gracefully
93
  for line in report_text.split("\n"):
 
94
  pdf.multi_cell(0, 10, line.encode("latin-1", "replace").decode("latin-1"))
95
  pdf.output(pdf_path, "F")
96
  return pdf_path
97
 
98
- async def run_research_streaming(
99
- query: str,
100
- report_type: str,
101
- report_source: str,
102
- sources: list,
103
- logs_placeholder
104
- ):
105
  """
106
- Run research and stream stdout to the provided placeholder.
107
  Returns (report_text, final_logs).
108
  """
109
  buf = io.StringIO()
110
 
111
  with redirect_stdout(buf):
112
- # For local/doc research, set DOC_PATH and ensure it exists
113
  if report_source == "local":
114
- os.environ["DOC_PATH"] = "./uploads"
115
- os.makedirs("uploads", exist_ok=True)
116
- researcher = GPTResearcher(query=query, report_type=report_type, report_source=report_source)
117
  else:
118
  researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources)
119
 
120
- # Kick off the task so we can poll logs while it runs
121
  task = asyncio.create_task(researcher.conduct_research())
122
 
123
- # Stream logs while the task runs
124
  while not task.done():
125
  await asyncio.sleep(0.5)
126
- logs_placeholder.code(_clean_logs(buf.getvalue()) or "Starting…")
 
 
 
 
 
 
127
 
128
- # Ensure exceptions are raised if any
129
  await task
130
 
131
- # One final refresh of logs after conduct_research finishes
132
- logs_placeholder.code(_clean_logs(buf.getvalue()) or "Finalizing…")
 
 
 
 
 
133
 
134
  # Write the report
135
  report_text = await researcher.write_report()
@@ -137,62 +92,41 @@ async def run_research_streaming(
137
  final_logs = buf.getvalue()
138
  return report_text, final_logs
139
 
140
-
141
- # -------------------------
142
- # UI
143
- # -------------------------
144
  st.title("GPT Researcher")
145
  st.markdown(
146
  """
147
- GPT Researcher is an autonomous agent for comprehensive online or local-document research,
148
- producing detailed, factual reports.
149
  """
150
  )
151
 
152
  with st.expander("Why Use GPT Researcher?", expanded=False):
153
  st.markdown(
154
  """
155
- - **Objective & Factual**: Focused on accurate information.
156
- - **Time-Efficient**: Automates the heavy lifting of research.
157
- - **Up-to-Date**: Pulls from the web or your uploaded files.
158
- - **Long-Form Reports**: Capable of 2,000+ word outputs.
159
  """
160
  )
161
 
162
- # Label styling
163
- st.markdown(
164
- """
165
- <style>
166
- .big-green-font { font-size:20px !important; font-weight:bold; color:green; margin-bottom:-10px; }
167
- .stTextInput > div > input { margin-top:-25px; }
168
- </style>
169
- """,
170
- unsafe_allow_html=True,
171
  )
172
 
173
- st.markdown('<p class="big-green-font">Enter your research query:</p>', unsafe_allow_html=True)
174
- default_query = "Why is the Stock Price of Nvidia Soaring?"
175
- user_query = st.text_input("", default_query, help="Type your research question or topic.")
176
-
177
  current_date = datetime.now().strftime("%B %Y")
178
  final_query = f"{user_query} Current Date is {current_date}" if user_query else ""
179
 
180
  st.sidebar.title("Research Settings")
181
 
182
- with st.sidebar.expander("How to Use", expanded=False):
183
- st.markdown(
184
- """
185
- 1. Choose **Web** or **Document** research.
186
- 2. Enter your **query** and pick **report type**.
187
- 3. Provide URLs **or** upload files (for document research).
188
- 4. Click **Run Research** — logs stream live; final report + PDF download appear at the end.
189
- """
190
- )
191
-
192
  research_type = st.sidebar.selectbox(
193
  "Select research type:",
194
  ["Web Research", "Document Research"],
195
- help="Choose between web-based research or research from local documents.",
196
  )
197
  report_type = st.sidebar.selectbox(
198
  "Select report type:",
@@ -200,22 +134,7 @@ report_type = st.sidebar.selectbox(
200
  help="Choose the format of the final report.",
201
  )
202
 
203
- # Model choices (ensure we never hit `o1-preview`)
204
- with st.sidebar.expander("Model Settings", expanded=False):
205
- strategic_choice = st.selectbox(
206
- "Strategic model",
207
- ["gpt-4o", "gpt-4o-mini"],
208
- index=0,
209
- help="Planning/analysis model used by the agent.",
210
- )
211
- smart_choice = st.selectbox(
212
- "Smart model",
213
- ["gpt-4o-mini", "gpt-4o"],
214
- index=0,
215
- help="Cheaper/faster model used by the agent.",
216
- )
217
-
218
- # Source inputs
219
  sources = []
220
  if research_type == "Web Research":
221
  sources_input = st.sidebar.text_area(
@@ -228,43 +147,33 @@ else:
228
  uploaded_files = st.sidebar.file_uploader(
229
  "Upload files for local research:",
230
  accept_multiple_files=True,
231
- help="Upload documents to analyze.",
232
  )
233
  if uploaded_files:
234
- os.makedirs("uploads", exist_ok=True)
235
  for up in uploaded_files:
236
- fp = os.path.join("uploads", up.name)
237
  with open(fp, "wb") as f:
238
  f.write(up.getbuffer())
239
 
240
- run_clicked = st.sidebar.button("Run Research", type="primary")
241
-
242
- # Warn if API keys are missing
243
  if not os.getenv("OPENAI_API_KEY") or not os.getenv("TAVILY_API_KEY"):
244
  st.error("OPENAI_API_KEY or TAVILY_API_KEY is not set in environment variables.")
245
 
246
- # -------------------------
247
- # Run the agent (with live logs)
248
- # -------------------------
249
  if run_clicked:
250
  if not user_query:
251
  st.warning("Please enter a research query.")
252
  else:
253
- # Retriever back-end (Tavily)
254
  os.environ["RETRIEVER"] = "tavily"
255
 
256
- # Apply model selections so gpt_researcher gets "<provider>:<model>"
257
- _apply_model_env(strategic_choice, smart_choice)
258
-
259
- # Decide the report source
260
  report_source = "local" if research_type == "Document Research" else "web"
261
 
262
- # Live logs area
263
- st.subheader("Agent Logs (live)")
264
  live_logs_placeholder = st.empty()
265
 
266
  with st.spinner("Running research…"):
267
- # Stream logs while running
268
  report_text, final_logs = asyncio.run(
269
  run_research_streaming(
270
  query=final_query,
@@ -275,18 +184,15 @@ if run_clicked:
275
  )
276
  )
277
 
278
- # Persist results
279
  st.session_state["report"] = report_text
280
  st.session_state["logs"] = final_logs
281
 
282
- # -------------------------
283
- # Show results (if any)
284
- # -------------------------
285
  if "report" in st.session_state:
286
  st.markdown("### Research Report")
287
  st.markdown(st.session_state["report"])
288
 
289
- # Create & offer PDF download
290
  try:
291
  pdf_path = create_pdf(st.session_state["report"])
292
  with open(pdf_path, "rb") as pdf_file:
@@ -299,16 +205,15 @@ if "report" in st.session_state:
299
  except Exception as e:
300
  st.warning(f"Could not generate PDF: {e}")
301
 
302
- # Final logs snapshot (separate from the live stream above)
303
- st.markdown("### Agent Logs")
304
  st.text_area(
305
- "Logs will appear here during/after the research process:",
306
- value=_clean_logs(st.session_state.get("logs", "")),
307
  height=220,
308
- key=f"logs_{uuid.uuid4()}",
309
  )
310
 
311
- # Hide default Streamlit footer & menu
312
  st.markdown(
313
  """
314
  <style>
 
1
# --- set a writable doc path BEFORE importing gpt_researcher ---
# NOTE(review): the rest of this module references the plain name `os`
# (os.makedirs, os.environ, os.getenv, os.path.join); importing only
# `os as _os` would leave `os` undefined and crash at runtime, so
# import it under its normal name.
import os

# gpt_researcher reads DOC_PATH at import/construction time for local
# document research, so it must be set (and exist) before the import below.
os.environ.setdefault("DOC_PATH", "/app/uploads")  # or "/tmp/my-docs"
os.makedirs(os.environ["DOC_PATH"], exist_ok=True)

# ---------------------------------------------------------------
7
  import io
8
  import uuid
9
  import asyncio
 
15
  from fpdf import FPDF
16
  from gpt_researcher import GPTResearcher
17
 
18
+ # Streamlit page config
 
 
 
19
  st.set_page_config(layout="wide", page_title="GPT Researcher")
20
 
21
+ # Allow asyncio.run in Streamlit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  nest_asyncio.apply()
23
 
24
+ # -------- PDF helper --------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  class PDF(FPDF):
26
  def header(self):
27
  self.set_font("Arial", "B", 12)
 
33
  self.cell(0, 10, f"Page {self.page_no()}", 0, 0, "C")
34
 
35
def create_pdf(report_text: str) -> str:
    """Render *report_text* as a PDF under /tmp and return the file path.

    A fresh UUID is folded into the filename so concurrent sessions never
    collide on the same output file.
    """
    out_path = f"/tmp/research_report_{uuid.uuid4().hex}.pdf"
    doc = PDF()
    doc.add_page()
    doc.set_auto_page_break(auto=True, margin=15)
    doc.set_font("Arial", size=12)
    # FPDF core fonts only understand latin-1, so unsupported characters
    # are replaced rather than raising.
    for text_line in report_text.split("\n"):
        safe_line = text_line.encode("latin-1", "replace").decode("latin-1")
        doc.multi_cell(0, 10, safe_line)
    doc.output(out_path, "F")
    return out_path
47
 
48
# -------- live research runner --------
async def run_research_streaming(query: str, report_type: str, report_source: str, sources: list, logs_placeholder):
    """
    Execute a GPT Researcher run while mirroring captured stdout into
    *logs_placeholder*, refreshing roughly twice per second.
    Returns (report_text, final_logs).
    """
    stdout_buffer = io.StringIO()

    with redirect_stdout(stdout_buffer):
        # Build the researcher for local-document or web research.
        if report_source == "local":
            # ensure DOC_PATH exists (already set before import, but keep it safe)
            os.makedirs(os.environ["DOC_PATH"], exist_ok=True)
            researcher = GPTResearcher(query=query, report_type=report_type, report_source="local")
        else:
            researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources)

        # Run research as a background task so the log view can refresh
        # while it is in flight.
        research_task = asyncio.create_task(researcher.conduct_research())

        while not research_task.done():
            await asyncio.sleep(0.5)
            captured = stdout_buffer.getvalue() or "Starting…"
            logs_placeholder.text_area(
                "Agent Logs (live)",
                value=captured,
                height=220,
                key=f"live_logs_{uuid.uuid4()}",
            )

        # Re-await so any exception raised inside the task surfaces here.
        await research_task

        # One last refresh once conduct_research has finished.
        logs_placeholder.text_area(
            "Agent Logs (live)",
            value=stdout_buffer.getvalue() or "Finalizing…",
            height=220,
            key=f"live_logs_final_{uuid.uuid4()}",
        )

        # Write the report
        report_text = await researcher.write_report()

    final_logs = stdout_buffer.getvalue()
    return report_text, final_logs
94
 
95
+ # ---------------- UI ----------------
 
 
 
96
  st.title("GPT Researcher")
97
  st.markdown(
98
  """
99
+ GPT Researcher is an autonomous agent for web/doc research that produces a detailed, factual report.
 
100
  """
101
  )
102
 
103
  with st.expander("Why Use GPT Researcher?", expanded=False):
104
  st.markdown(
105
  """
106
+ - **Objective & Factual**
107
+ - **Time-Efficient**
108
+ - **Up-to-Date** (web or uploaded docs)
109
+ - **Long-Form Reports** (2,000+ words possible)
110
  """
111
  )
112
 
113
+ # Input label with accessibility (hide visually but not empty)
114
+ user_query = st.text_input(
115
+ "Research query",
116
+ "Why is the Stock Price of Nvidia Soaring?",
117
+ help="Type your research question or topic.",
118
+ label_visibility="collapsed",
 
 
 
119
  )
120
 
 
 
 
 
121
  current_date = datetime.now().strftime("%B %Y")
122
  final_query = f"{user_query} Current Date is {current_date}" if user_query else ""
123
 
124
  st.sidebar.title("Research Settings")
125
 
 
 
 
 
 
 
 
 
 
 
126
  research_type = st.sidebar.selectbox(
127
  "Select research type:",
128
  ["Web Research", "Document Research"],
129
+ help="Choose web-based research or research from local documents.",
130
  )
131
  report_type = st.sidebar.selectbox(
132
  "Select report type:",
 
134
  help="Choose the format of the final report.",
135
  )
136
 
137
+ # Sources / uploads
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  sources = []
139
  if research_type == "Web Research":
140
  sources_input = st.sidebar.text_area(
 
147
  uploaded_files = st.sidebar.file_uploader(
148
  "Upload files for local research:",
149
  accept_multiple_files=True,
150
+ help=f"Files are saved to {os.environ['DOC_PATH']}",
151
  )
152
  if uploaded_files:
 
153
  for up in uploaded_files:
154
+ fp = os.path.join(os.environ["DOC_PATH"], up.name)
155
  with open(fp, "wb") as f:
156
  f.write(up.getbuffer())
157
 
158
+ # Keys check (optional UI hint)
 
 
159
  if not os.getenv("OPENAI_API_KEY") or not os.getenv("TAVILY_API_KEY"):
160
  st.error("OPENAI_API_KEY or TAVILY_API_KEY is not set in environment variables.")
161
 
162
+ run_clicked = st.sidebar.button("Run Research", type="primary")
163
+
 
164
  if run_clicked:
165
  if not user_query:
166
  st.warning("Please enter a research query.")
167
  else:
168
+ # Use Tavily retriever (what the original app did)
169
  os.environ["RETRIEVER"] = "tavily"
170
 
 
 
 
 
171
  report_source = "local" if research_type == "Document Research" else "web"
172
 
173
+ st.subheader("Agent Logs")
 
174
  live_logs_placeholder = st.empty()
175
 
176
  with st.spinner("Running research…"):
 
177
  report_text, final_logs = asyncio.run(
178
  run_research_streaming(
179
  query=final_query,
 
184
  )
185
  )
186
 
 
187
  st.session_state["report"] = report_text
188
  st.session_state["logs"] = final_logs
189
 
190
+ # ------------- Results -------------
 
 
191
  if "report" in st.session_state:
192
  st.markdown("### Research Report")
193
  st.markdown(st.session_state["report"])
194
 
195
+ # Create & offer PDF
196
  try:
197
  pdf_path = create_pdf(st.session_state["report"])
198
  with open(pdf_path, "rb") as pdf_file:
 
205
  except Exception as e:
206
  st.warning(f"Could not generate PDF: {e}")
207
 
208
+ st.markdown("### Agent Logs (final)")
 
209
  st.text_area(
210
+ "Logs snapshot after run:",
211
+ value=st.session_state.get("logs", ""),
212
  height=220,
213
+ key=f"logs_snapshot_{uuid.uuid4()}",
214
  )
215
 
216
+ # Hide Streamlit footer & menu
217
  st.markdown(
218
  """
219
  <style>