Guiyom committed on
Commit
01a84ee
·
verified ·
1 Parent(s): 01d7266

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -156
app.py CHANGED
@@ -21,6 +21,66 @@ TOTAL_SUMMARIZED_WORDS = 0
21
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
22
  # ============================================================================= Helper functions
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def process_pdf(url: str) -> str:
25
  try:
26
  headers = {"User-Agent": get_random_header()}
@@ -2901,6 +2961,7 @@ class ReportGenerator:
2901
  solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
2902
  solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
2903
  html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
 
2904
  html_content = html_content.replace("<h2>Table of Contents</h2>", "<div class='page-break'></div><h2>Table of Contents</h2>")
2905
  html_content = html_content.replace("<h2>Introduction</h2>", "<div class='page-break'></div><h2>Introduction</h2>")
2906
  html_content = html_content.replace("<h2>Conclusion</h2>", "<div class='page-break'></div><h2>Conclusion</h2>")
@@ -2911,80 +2972,29 @@ class ReportGenerator:
2911
  return html_content
2912
 
2913
  def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
2914
- # Generate the full HTML report (including text, focus placeholders, and mermaid visuals as iframes)
2915
  html_report = self.generate_report_html(solution_content)
2916
 
2917
- # Add header
2918
  date_str = datetime.now().strftime("%Y-%m-%d")
2919
  header = ""
2920
  if metadata:
2921
- header = f"<p>Search Query: {metadata.get('Query name', 'N/A')}<br>Author: {metadata.get('User name', 'N/A')} | Date: {metadata.get('Date', date_str)}</p>"
 
2922
  soup = BeautifulSoup(html_report, "html.parser")
2923
- body_tag = soup.body
2924
- if body_tag:
2925
- body_tag.insert(0, BeautifulSoup(header, "html.parser"))
2926
- updated_html = str(soup)
2927
-
2928
- # Parse the HTML
2929
- logging.info(f"ReportGenerator: soup report generated:\n{soup}")
2930
-
2931
- # Find all mermaid visual iframes (assumed to have class "visual-frame")
2932
- visual_iframes = soup.find_all("iframe", class_="visual-frame")
2933
-
2934
- if visual_iframes:
2935
- # Set up Selenium with a window size and high DPI for better image resolution
2936
- import base64, tempfile, time
2937
- import chromedriver_autoinstaller
2938
- chromedriver_autoinstaller.install()
2939
- # (Removed the explicit print statement to keep logs clean)
2940
- from selenium import webdriver
2941
- from selenium.webdriver.chrome.options import Options
2942
- from selenium.webdriver.chrome.service import Service
2943
- options = Options()
2944
- options.add_argument("--headless")
2945
- options.add_argument("--no-sandbox")
2946
- options.add_argument("--disable-dev-shm-usage")
2947
- options.add_argument("--window-size=1200,1200")
2948
- options.add_argument("--force-device-scale-factor=2")
2949
- service = Service(log_path=os.devnull)
2950
- driver = webdriver.Chrome(service=service, options=options)
2951
-
2952
- for iframe in visual_iframes:
2953
- # Assume the iframe has its content in srcdoc (as generated in generate_visual_snippet)
2954
- srcdoc = iframe.get("srcdoc")
2955
- if srcdoc:
2956
- with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_file:
2957
- tmp_file.write(srcdoc.encode("utf-8"))
2958
- tmp_file.flush()
2959
- file_url = "file://" + tmp_file.name
2960
-
2961
- driver.get(file_url)
2962
- time.sleep(3) # Allow time for JavaScript (e.g., mermaid) to render
2963
- screenshot_png = driver.get_screenshot_as_png()
2964
-
2965
- # Optional: Crop the screenshot to remove extra whitespace:
2966
- from PIL import Image
2967
- from io import BytesIO
2968
- img = Image.open(BytesIO(screenshot_png))
2969
- cropped_img = img.crop(img.getbbox())
2970
- buffer = BytesIO()
2971
- cropped_img.save(buffer, format='PNG')
2972
- cropped_png = buffer.getvalue()
2973
-
2974
- b64_img = base64.b64encode(cropped_png).decode("utf-8")
2975
- new_tag = soup.new_tag("img")
2976
- # Add page-break style to avoid forcing a new page after the image when generating the PDF.
2977
- new_tag["style"] = "max-width: 500px; display: block; margin: auto; page-break-after: avoid;"
2978
- new_tag["src"] = "data:image/png;base64," + b64_img
2979
- iframe.replace_with(new_tag)
2980
- driver.quit()
2981
 
2982
- # Instead of converting the entire soup (which may include nested <html> tags), extract only the content within <body>
2983
  body_tag = soup.find("body")
2984
- body_content = body_tag.decode_contents() if body_tag else ""
2985
-
2986
- # Reassemble a clean, single HTML document with our desired CSS (preserving line breaks)
2987
- final_html = f"""
2988
  <html>
2989
  <head>
2990
  <meta charset="utf-8" />
@@ -3011,101 +3021,16 @@ class ReportGenerator:
3011
  </body>
3012
  </html>
3013
  """
3014
- # Preprocessing for log display
3015
- def remove_img_tags(input_string):
3016
- # Regex pattern to match <img> tags with any src attribute
3017
- pattern = r'<img src=.*?>'
3018
- # Replace all occurrences with an empty string
3019
- cleaned_string = re.sub(pattern, '', input_string, flags=re.MULTILINE)
3020
- return cleaned_string
3021
- cleaned_string = remove_img_tags(final_html)
3022
- logging.info(f"ReportGenerator: Final HTML for PDF conversion:\n{cleaned_string}")
3023
-
3024
- # Crafting compliance
3025
- final_html = final_html.replace("<h1","<br><br><br><h1").replace("</h1>","</h1><br>")
3026
- final_html = final_html.replace("<h2","<br><br><b><h2").replace("</h2>","</b></h2><br>")
3027
- final_html = final_html.replace("<h3","<br><br><h3").replace("</h3>","</b></h3><br>")
3028
- final_html = final_html.replace("<h4","<br><h4")
3029
- final_html = final_html.replace("<div","<br><div")
3030
- final_html = final_html.replace("<table>","<br><table>")
3031
-
3032
- # Generate the final PDF from final_html using xhtml2pdf (A4 layout)
3033
  pdf_buffer = io.BytesIO()
3034
  pisa_status = pisa.CreatePDF(final_html, dest=pdf_buffer,
3035
- link_callback=lambda uri, rel: uri,
3036
- default_css="""
3037
- @page {
3038
- size: A4;
3039
- margin: 0.5in;
3040
- }
3041
- body {
3042
- font-family: Helvetica, sans-serif;
3043
- background: white;
3044
- margin: 40px;
3045
- padding: 0;
3046
- }
3047
- h1 {
3048
- font-size: 20pt;
3049
- margin-bottom: 12px;
3050
- text-align: left;
3051
- font-weight: bold;
3052
- }
3053
- h2 {
3054
- font-size: 16pt;
3055
- margin-bottom: 10px;
3056
- text-align: left;
3057
- font-weight: bold;
3058
- }
3059
- h3 {
3060
- font-size: 14pt;
3061
- margin-bottom: 8px;
3062
- text-align: left;
3063
- font-weight: bold;
3064
- }
3065
- h4 {
3066
- font-size: 12pt;
3067
- text-align: left;
3068
- font-weight: bold;
3069
- }
3070
- table {
3071
- border: 1px solid black;
3072
- }
3073
- p {
3074
- font-size: 11pt;
3075
- line-height: 1.5;
3076
- margin-bottom: 10px;
3077
- }
3078
- pre, div {
3079
- }
3080
- ol, ul {
3081
- font-size: 11pt;
3082
- margin-left: 20px;
3083
- line-height: 1.5;
3084
- }
3085
- hr {
3086
- border: 1px solid #ccc;
3087
- margin: 20px 0;
3088
- }
3089
- table {
3090
- border-collapse: collapse;
3091
- width: 100%;
3092
- margin-bottom: 10px;
3093
- }
3094
- th, td {
3095
- border: 1px solid #ccc;
3096
- padding: 8px;
3097
- text-align: left;
3098
- }
3099
- th {
3100
- background-color: #f2f2f2;
3101
- }
3102
- .page-break {
3103
- page-break-before: always;
3104
- }
3105
- """)
3106
  if pisa_status.err:
3107
  logging.error("Error generating PDF with xhtml2pdf.")
3108
  return None
 
3109
  return pdf_buffer.getvalue()
3110
 
3111
  def handle_generate_report(query_name: str, user_name: str, final_report: str):
 
21
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
22
  # ============================================================================= Helper functions
23
 
24
+ def capture_visual_screenshot(srcdoc: str) -> str:
25
+ """
26
+ Opens a temporary HTML file from the provided srcdoc string,
27
+ loads it in a headless Chrome browser using Selenium,
28
+ waits for the content to render,
29
+ takes a screenshot, crops it, and returns a base64-encoded PNG image.
30
+ """
31
+ options = Options()
32
+ options.add_argument("--headless")
33
+ options.add_argument("--no-sandbox")
34
+ options.add_argument("--disable-dev-shm-usage")
35
+ driver = None
36
+ try:
37
+ driver = webdriver.Chrome(options=options)
38
+ driver.set_window_size(1080, 720) # Adjust per your expected visual dimensions
39
+
40
+ # Write the srcdoc to a temporary HTML file
41
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_file:
42
+ tmp_file.write(srcdoc.encode("utf-8"))
43
+ tmp_file.flush()
44
+ file_url = "file://" + tmp_file.name
45
+
46
+ driver.get(file_url)
47
+ time.sleep(3) # Allow time for JavaScript (e.g., mermaid) to render
48
+
49
+ screenshot_png = driver.get_screenshot_as_png()
50
+ image = Image.open(BytesIO(screenshot_png))
51
+ # Crop the image to remove extra whitespace
52
+ cropped_image = image.crop(image.getbbox())
53
+ buffered = BytesIO()
54
+ cropped_image.save(buffered, format='PNG')
55
+ base64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
56
+ return base64_img
57
+ except WebDriverException as e:
58
+ logging.error("Error in capture_visual_screenshot: %s", e)
59
+ return ""
60
+ finally:
61
+ if driver:
62
+ driver.quit()
63
+
64
+ def replace_visual_iframes(soup: BeautifulSoup) -> BeautifulSoup:
65
+ """
66
+ Finds all <iframe class="visual-frame"> tags in the provided BeautifulSoup object,
67
+ uses capture_visual_screenshot() to get a base64 image,
68
+ and replaces each iframe with an <img> tag embedding the screenshot.
69
+ """
70
+ iframes = soup.find_all("iframe", class_="visual-frame")
71
+ for iframe in iframes:
72
+ srcdoc = iframe.get("srcdoc")
73
+ if srcdoc:
74
+ base64_img = capture_visual_screenshot(srcdoc)
75
+ if base64_img:
76
+ new_img = soup.new_tag("img")
77
+ new_img["src"] = "data:image/png;base64," + base64_img
78
+ new_img["style"] = "max-width:100%; display:block; margin:auto; page-break-after:avoid;"
79
+ iframe.replace_with(new_img)
80
+ else:
81
+ logging.error("Failed to capture screenshot for an iframe.")
82
+ return soup
83
+
84
  def process_pdf(url: str) -> str:
85
  try:
86
  headers = {"User-Agent": get_random_header()}
 
2961
  solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
2962
  solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
2963
  html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
2964
+ # Insert page break divs before key sections
2965
  html_content = html_content.replace("<h2>Table of Contents</h2>", "<div class='page-break'></div><h2>Table of Contents</h2>")
2966
  html_content = html_content.replace("<h2>Introduction</h2>", "<div class='page-break'></div><h2>Introduction</h2>")
2967
  html_content = html_content.replace("<h2>Conclusion</h2>", "<div class='page-break'></div><h2>Conclusion</h2>")
 
2972
  return html_content
2973
 
2974
  def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
2975
+ # Generate the full HTML report (including text, placeholders, and mermaid visuals as iframes)
2976
  html_report = self.generate_report_html(solution_content)
2977
 
2978
+ # Add header if provided in metadata
2979
  date_str = datetime.now().strftime("%Y-%m-%d")
2980
  header = ""
2981
  if metadata:
2982
+ header = (f"<p>Search Query: {metadata.get('Query name', 'N/A')}<br>"
2983
+ f"Author: {metadata.get('User name', 'N/A')} | Date: {metadata.get('Date', date_str)}</p>")
2984
  soup = BeautifulSoup(html_report, "html.parser")
2985
+ if soup.body:
2986
+ soup.body.insert(0, BeautifulSoup(header, "html.parser"))
2987
+ logging.info("ReportGenerator: Soup report generated:\n%s", soup)
2988
+
2989
+ # Replace all iframes (class 'visual-frame') with images
2990
+ soup = replace_visual_iframes(soup)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2991
 
2992
+ # Extract only the body content if <body> exists, otherwise use full HTML
2993
  body_tag = soup.find("body")
2994
+ body_content = body_tag.decode_contents() if body_tag else str(soup)
2995
+
2996
+ # Reassemble a clean HTML document with inline CSS styles for PDF conversion.
2997
+ final_html = f"""<!DOCTYPE html>
2998
  <html>
2999
  <head>
3000
  <meta charset="utf-8" />
 
3021
  </body>
3022
  </html>
3023
  """
3024
+ logging.info("ReportGenerator: Final HTML for PDF conversion generated.")
3025
+
3026
+ # Generate the PDF using xhtml2pdf (pisa)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3027
  pdf_buffer = io.BytesIO()
3028
  pisa_status = pisa.CreatePDF(final_html, dest=pdf_buffer,
3029
+ link_callback=lambda uri, rel: uri)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3030
  if pisa_status.err:
3031
  logging.error("Error generating PDF with xhtml2pdf.")
3032
  return None
3033
+ logging.info("ReportGenerator: PDF generated successfully.")
3034
  return pdf_buffer.getvalue()
3035
 
3036
  def handle_generate_report(query_name: str, user_name: str, final_report: str):