Guiyom committed on
Commit
cf84868
·
verified ·
1 Parent(s): 073e11f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -46
app.py CHANGED
@@ -1103,6 +1103,91 @@ def get_random_header():
1103
  ]
1104
  return random.choice(headers)
1105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1106
  def clean_content(raw_content: str) -> str:
1107
  # Parse HTML using BeautifulSoup (if not HTML, it will safely return the text)
1108
  soup = BeautifulSoup(raw_content, "html.parser")
@@ -1227,12 +1312,7 @@ def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: floa
1227
  snippet = summarize_large_text(snippet, target_length=2000, chunk_size=1000, overlap=200)
1228
  snippet_words = len(snippet.split())
1229
 
1230
- # Decide a proportional dynamic token count (for reference; not used to limit the API call below)
1231
- dynamic_tokens = min(3000, max(250, int(snippet_words * 0.5)))
1232
-
1233
  client = os.getenv('OPENAI_API_KEY') # alternatively, pass your API key here if needed.
1234
- # (Assuming you use a client instance from your OpenAI library elsewhere.)
1235
- # Here, we assume that openai.OpenAI(api_key=...) is wrapped by openai_call.
1236
 
1237
  prompt = (f"""Analyze the following content from a query result:
1238
 
@@ -1248,7 +1328,7 @@ Instructions:
1248
  - Key Facts (at least 5): List the core factual claims using short, declarative sentences or bullet points. Apply lemmatization and standard abbreviations.
1249
  - Key Figures (at least 5): Extract numerical data (statistics, dates, percentages) and include any necessary context (units, references, explanations) required to interpret these numbers. Present them concisely (list or table format).
1250
  - Key Arguments (at least 5): Identify main arguments or claims. Summarize supporting evidence and counter-arguments concisely.
1251
- - Key Quotes (at least 1 if any): Include significant quotes (with the author's name in parentheses). Attribute quotes correctly. Paraphrase if needed, indicating that its a paraphrase. Use symbols (e.g., &, +, ->, =) to conserve tokens.
1252
  - Structured Summary (10 to 50 sentences): Provide a structured summary that includes anecdotes, people, and locations to ensure the report is relatable.
1253
 
1254
  Note: General Optimization Guidelines:
@@ -1258,40 +1338,88 @@ Note: General Optimization Guidelines:
1258
  - Shorten words carefully (e.g., "information" -> "info") without causing ambiguity.
1259
  - Use symbols where appropriate.
1260
 
1261
- 3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries relevant to the research topic and the summarized content. Use search operators (AND, OR, quotation marks) as needed. Output the queries as a JSON list of strings (e.g., ["query1", "query2", ...]) with no additional formatting, extra text, or markdown (do not include the word "python" anywhere).
1262
 
1263
  4. Ensure that the summary length and level of detail is proportional to the source length.
1264
  Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
1265
 
1266
- **Output Requirement: Output the queries as a JSON list of strings (e.g., ["query1", "query2", ...]) with no additional formatting, extra text, or markdown (No mention of the coding language ex:"python" or "html" anywhere before the result).
1267
-
1268
- Proceed."""
 
 
1269
  )
1270
 
1271
  try:
1272
  response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
1273
- res_text = response.strip()
1274
- if not res_text:
1275
  logging.error("analyze_with_gpt4o: Empty response received from API.")
1276
  return {"relevant": "no", "summary": "", "followups": []}
1277
- # Remove Markdown code fences if present
1278
- if res_text.startswith("```"):
1279
- res_text = re.sub(r"^```(json)?", "", res_text)
1280
- res_text = re.sub(r"```$", "", res_text).strip()
1281
- res_text = res_text.strip().strip("```").strip()
1282
- # Optionally remove any start/end markers like "json" if present:
1283
- if res_text.lower().startswith("json"):
1284
- res_text = res_text[4:].strip()
1285
- try:
1286
- result = json.loads(res_text)
1287
- except json.JSONDecodeError as je:
1288
- logging.error(f"analyze_with_gpt4o: JSON decode error: {je}. Raw response: '{res_text}'")
1289
- return {"relevant": "no", "summary": "", "followups": []}
1290
- except json.JSONDecodeError as je:
1291
- logging.error(f"analyze_with_gpt4o: JSON decode error: {je}. Raw response: '{res_text}'")
1292
- return {"relevant": "no", "summary": "", "followups": []}
1293
- logging.info(f"analyze_with_gpt4o: snippet analysis result: {result}")
1294
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1295
  except Exception as e:
1296
  logging.error(f"analyze_with_gpt4o error: {e}")
1297
  return {"relevant": "no", "summary": "", "followups": []}
@@ -2258,26 +2386,19 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
2258
  title = res.get("title", "No Title")
2259
  if not url:
2260
  continue
2261
- if url.lower().endswith(".pdf"):
2262
- raw_content = process_pdf(url)
2263
- if "Error processing PDF" in raw_content:
2264
- continue
2265
- else:
2266
- try:
2267
- headers = {"User-Agent": get_random_header()}
2268
- response = requests.get(url, headers=headers)
2269
- response.raise_for_status()
2270
- raw_content = response.text
2271
- process_log += f"Extracted full page content from {url}\n"
2272
- except Exception as e:
2273
- logging.error(f"Error retrieving content from {url}: {e}")
2274
- process_log += f"Error retrieving content from {url}: {e}\n"
2275
- continue
2276
  # Skip processing if raw_content is empty or too short (< 1000 characters)
2277
- if not raw_content or len(raw_content) < 1000 or "could not be extracted" in raw_content.lower() or "error" in raw_content.lower():
2278
  process_log += f"Content from {url} is either an error or too short (<1000 characters), skipping.\n"
2279
  continue
2280
 
 
 
2281
  # 1) Clean and do minimal parse
2282
  cleaned_html = clean_content(raw_content)
2283
  # 2) Extract structured data
 
1103
  ]
1104
  return random.choice(headers)
1105
 
1106
def process_url(url: str, retries: int = 3, timeout: int = 15) -> str:
    """
    Retrieve the content of a URL, rotating user agents and referrers across
    attempts to work around 403 (bot-blocking) responses and other common
    web-scraping issues.

    Args:
        url: The URL to retrieve content from.
        retries: Number of retry attempts (each attempt uses a different
            user agent / referrer pair).
        timeout: Connection timeout in seconds for each attempt.

    Returns:
        The page content as a string on success, or a message starting with
        "Error" describing the failure. Callers rely on the "Error" prefix
        to detect failures, so every failure path below must use it.
    """
    # PDFs go through the dedicated extractor rather than a plain GET.
    if url.lower().endswith(".pdf"):
        return process_pdf(url)

    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
        "Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1"
    ]

    referrers = [
        "https://www.google.com/",
        "https://www.bing.com/",
        "https://search.yahoo.com/",
        "https://duckduckgo.com/",
        "https://www.baidu.com/"
    ]

    for attempt in range(retries):
        try:
            # Choose a different user agent and referrer for each attempt.
            headers = {
                "User-Agent": user_agents[attempt % len(user_agents)],
                "Referer": referrers[attempt % len(referrers)],
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "Cache-Control": "max-age=0"
            }

            # Back off with a randomized, linearly increasing delay between
            # retries to avoid hammering the server.
            if attempt > 0:
                delay = random.uniform(2, 5) * attempt
                logging.info(f"Retry {attempt+1}/{retries} for {url} - waiting {delay:.1f} seconds")
                time.sleep(delay)

            response = requests.get(url, headers=headers, timeout=timeout)

            if response.status_code == 200:
                logging.info(f"Successfully retrieved content from {url} on attempt {attempt+1}")
                return response.text
            elif response.status_code == 403:
                # Likely bot detection; retry with the next UA/referrer pair.
                logging.warning(f"403 Forbidden on attempt {attempt+1} for {url}, trying different user agent")
                continue
            elif response.status_code == 404:
                # Permanent; no point retrying.
                return f"Error: Page not found (404) for {url}"
            else:
                # Raise HTTPError so the generic handler below decides
                # whether to retry or give up.
                response.raise_for_status()

        except requests.exceptions.Timeout:
            logging.warning(f"Timeout on attempt {attempt+1} for {url}")
            if attempt == retries - 1:
                return f"Error: Timeout after {retries} attempts for {url}"

        except requests.exceptions.TooManyRedirects:
            # Redirect loops are permanent; no point retrying.
            return f"Error: Too many redirects for {url}"

        except requests.exceptions.ConnectionError:
            logging.warning(f"Connection error on attempt {attempt+1} for {url}")
            if attempt == retries - 1:
                return f"Error: Could not connect to {url} after {retries} attempts"

        except Exception as e:
            logging.error(f"Error retrieving content from {url} (attempt {attempt+1}): {e}")
            if attempt == retries - 1:
                return f"Error accessing URL: {str(e)}"

    # Fix: prefix with "Error" so callers that check
    # raw_content.startswith("Error") detect the exhausted-retries case
    # (previously this sentinel slipped past that check).
    return f"Error: Content could not be retrieved from {url} after {retries} attempts"
1190
+
1191
  def clean_content(raw_content: str) -> str:
1192
  # Parse HTML using BeautifulSoup (if not HTML, it will safely return the text)
1193
  soup = BeautifulSoup(raw_content, "html.parser")
 
1312
  snippet = summarize_large_text(snippet, target_length=2000, chunk_size=1000, overlap=200)
1313
  snippet_words = len(snippet.split())
1314
 
 
 
 
1315
  client = os.getenv('OPENAI_API_KEY') # alternatively, pass your API key here if needed.
 
 
1316
 
1317
  prompt = (f"""Analyze the following content from a query result:
1318
 
 
1328
  - Key Facts (at least 5): List the core factual claims using short, declarative sentences or bullet points. Apply lemmatization and standard abbreviations.
1329
  - Key Figures (at least 5): Extract numerical data (statistics, dates, percentages) and include any necessary context (units, references, explanations) required to interpret these numbers. Present them concisely (list or table format).
1330
  - Key Arguments (at least 5): Identify main arguments or claims. Summarize supporting evidence and counter-arguments concisely.
1331
+ - Key Quotes (at least 1 if any): Include significant quotes (with the author's name in parentheses). Attribute quotes correctly. Paraphrase if needed, indicating that it's a paraphrase. Use symbols (e.g., &, +, ->, =) to conserve tokens.
1332
  - Structured Summary (10 to 50 sentences): Provide a structured summary that includes anecdotes, people, and locations to ensure the report is relatable.
1333
 
1334
  Note: General Optimization Guidelines:
 
1338
  - Shorten words carefully (e.g., "information" -> "info") without causing ambiguity.
1339
  - Use symbols where appropriate.
1340
 
1341
+ 3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries relevant to the research topic and the summarized content. Use search operators (AND, OR, quotation marks) as needed.
1342
 
1343
  4. Ensure that the summary length and level of detail is proportional to the source length.
1344
  Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
1345
 
1346
+ IMPORTANT: Format your response as a proper JSON object with these fields:
1347
+ - "relevant": "yes" or "no"
1348
+ - "summary": {...your structured summary with all parts...}
1349
+ - "followups": [array of follow-up queries]
1350
+ """
1351
  )
1352
 
1353
  try:
1354
  response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
1355
+ if not response:
 
1356
  logging.error("analyze_with_gpt4o: Empty response received from API.")
1357
  return {"relevant": "no", "summary": "", "followups": []}
1358
+
1359
+ # Check if the response already begins with "yes" or "no" (non-JSON format)
1360
+ if response.strip().lower().startswith("yes") or response.strip().lower().startswith("no"):
1361
+ # Handle non-JSON format
1362
+ lines = response.strip().split("\n")
1363
+ relevance = "yes" if lines[0].strip().lower() == "yes" else "no"
1364
+
1365
+ # Extract the follow-up queries (usually in brackets at the end)
1366
+ followups_match = re.search(r'\[(.*?)\]', response, re.DOTALL)
1367
+ followups = []
1368
+ if followups_match:
1369
+ followups_text = followups_match.group(1)
1370
+ # Parse the lines within brackets as comma-separated or quoted items
1371
+ followups = [q.strip().strip('"\'') for q in re.findall(r'"([^"]*)"', followups_text)]
1372
+ if not followups: # Try without quotes
1373
+ followups = [q.strip() for q in followups_text.split(",")]
1374
+
1375
+ # Everything else is the summary
1376
+ summary_text = response
1377
+ if followups_match:
1378
+ summary_text = response[:followups_match.start()].strip()
1379
+ if summary_text.startswith("yes") or summary_text.startswith("no"):
1380
+ summary_text = "\n".join(lines[1:]).strip()
1381
+
1382
+ return {
1383
+ "relevant": relevance,
1384
+ "summary": summary_text,
1385
+ "followups": followups if followups else []
1386
+ }
1387
+
1388
+ else:
1389
+ # Standard JSON parsing
1390
+ # Remove Markdown code fences if present
1391
+ if response.startswith("```"):
1392
+ response = re.sub(r"^```(json)?", "", response)
1393
+ response = re.sub(r"```$", "", response).strip()
1394
+ response = response.strip()
1395
+ # Optionally remove any start/end markers like "json" if present:
1396
+ if response.lower().startswith("json"):
1397
+ response = response[4:].strip()
1398
+ try:
1399
+ result = json.loads(response)
1400
+ return result
1401
+ except json.JSONDecodeError:
1402
+ # If JSON parsing fails, try to extract the information using regex
1403
+ logging.warning("JSON parsing failed, attempting regex extraction")
1404
+ relevance_match = re.search(r'"relevant":\s*"(yes|no)"', response, re.IGNORECASE)
1405
+ relevance = relevance_match.group(1) if relevance_match else "no"
1406
+
1407
+ # Extract follow-up queries using regex
1408
+ followups_match = re.search(r'"followups":\s*\[(.*?)\]', response, re.DOTALL)
1409
+ followups = []
1410
+ if followups_match:
1411
+ followups_text = followups_match.group(1)
1412
+ followups = [q.strip().strip('"\'') for q in re.findall(r'"([^"]*)"', followups_text)]
1413
+
1414
+ # Extract summary (everything else)
1415
+ summary = response
1416
+
1417
+ return {
1418
+ "relevant": relevance,
1419
+ "summary": summary,
1420
+ "followups": followups
1421
+ }
1422
+
1423
  except Exception as e:
1424
  logging.error(f"analyze_with_gpt4o error: {e}")
1425
  return {"relevant": "no", "summary": "", "followups": []}
 
2386
  title = res.get("title", "No Title")
2387
  if not url:
2388
  continue
2389
+
2390
+ raw_content = process_url(url)
2391
+ if raw_content.startswith("Error"):
2392
+ process_log += f"{raw_content}\n"
2393
+ continue
2394
+
 
 
 
 
 
 
 
 
 
2395
  # Skip processing if raw_content is empty or too short (< 1000 characters)
2396
+ if not raw_content or len(raw_content) < 1000 or "could not be extracted" in raw_content.lower():
2397
  process_log += f"Content from {url} is either an error or too short (<1000 characters), skipping.\n"
2398
  continue
2399
 
2400
+ process_log += f"Successfully extracted content from {url}\n"
2401
+
2402
  # 1) Clean and do minimal parse
2403
  cleaned_html = clean_content(raw_content)
2404
  # 2) Extract structured data