Spaces:

10gen
/

deepsearchitv2

Running

App Files Files Community

Guiyom committed on Mar 10, 2025

Commit

6986d5c

verified ·

1 Parent(s): 9a02e15

Update app.py

Browse files

Files changed (1) hide show

app.py +210 -103

app.py CHANGED Viewed

@@ -2892,10 +2892,10 @@ def validate_visual_html(html: str) -> bool:
 class ReportGenerator:
     def __init__(self, render_with_selenium: bool = False):
-        # Flag to determine if we are rendering the final PDF with Selenium
         self.render_with_selenium = render_with_selenium
-    def generate_report_html(self, solution_content: str, metadata: dict = None) -> str:
         # Normalize text and fix dash characters.
         solution_content = unicodedata.normalize('NFKC', solution_content)
         solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
@@ -2907,118 +2907,205 @@ class ReportGenerator:
         html_content = html_content.replace("<h2>References</h2>", "<div class='page-break'></div><h2>References</h2>")
         html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>", "<div class='page-break'></div><h2>Surprise-Me Extension Report</h2>")
         date_str = datetime.now().strftime("%Y-%m-%d")
         header = ""
         if metadata:
-            header = f"""<h1>Search Query: {metadata.get('Query name', 'N/A')}</h1>
-<p>Author: {metadata.get('User name', 'N/A')}</p>
-<p>Date: {metadata.get('Date', date_str)}</p>
-<hr/>"""
-        # Force a white background for the entire page (overriding any light grey)
-        full_html = f"""
 <html>
-<head>
     <meta charset="utf-8" />
     <style>
-      body {{ font-family: Helvetica, sans-serif; margin: 40px; background: white; }}
-      h1 {{ font-size: 24pt; margin-bottom: 12px; text-align: left; }}
-      h2 {{ font-size: 20pt; margin-bottom: 10px; text-align: left; }}
-      h3 {{ font-size: 18pt; margin-bottom: 8px; text-align: left; }}
-      /* Force paragraphs to have a white background */
-      p {{ font-size: 11pt; line-height: 1.5; margin-bottom: 10px; background-color: white !important; }}
-      ol, ul {{ font-size: 11pt; margin-left: 20px; line-height: 1.5; }}
-      h2 + ol {{ line-height: 1 !important; }}
-      hr {{ border: 1px solid #ccc; margin: 20px 0; }}
-      table {{ border-collapse: collapse; width: 100%; margin-bottom: 10px; }}
-      th, td {{ border: 1px solid #ccc; padding: 8px; text-align: left; }}
-      th {{ background-color: #f2f2f2; }}
-      .page-break {{ page-break-before: always; }}
     </style>
-    <script>
-        console.log("Report loaded successfully.");
-    </script>
-</head>
-<body>
-    {header}
-    {html_content}
-</body>
 </html>
 """
-        logging.info("ReportGenerator: HTML report generated successfully.")
-        return full_html
-    def fallback_pdf_generation(self, html_content: str) -> bytes:
-        from selenium import webdriver
-        from selenium.webdriver.chrome.options import Options
-        import time
-        import io
-        options = Options()
-        options.add_argument("--headless")
-        options.add_argument("--disable-gpu")
-        options.add_argument("--no-sandbox")
-        options.add_argument("--window-size=1920,1080")
-        driver = webdriver.Chrome(options=options)
-        try:
-            driver.get(f"data:text/html;charset=utf-8,{html_content}")
-            time.sleep(10)
-            screenshot_png = driver.get_screenshot_as_png()
-            return screenshot_png
-        finally:
-            driver.quit()
-    def generate_pdf_with_mermaid(self, html_report: str) -> bytes:
-        """
-        This method processes the given HTML report, extracts all mermaid diagrams
-        (assumed to be rendered as SVG inside a <div class="mermaid">), and uses
-        svg-to-PDFKit to embed each diagram into a PDF document.
-        """
-        from pdfkit import PDFDocument  # hypothetical module import
-        from blobstream import BlobStream   # hypothetical; adapt as needed
-        from svg2pdfkit import SVGtoPDF     # pseudo-import; adjust to your environment
-        # Create a new PDF document using PDFKit's PDFDocument (or equivalent)
-        doc = PDFDocument({ "compress": True })
-        # Keep track of Y-offset for inserting each diagram
-        current_y = doc.y if hasattr(doc, "y") else 0
-        # Parse the HTML to extract mermaid diagrams
-        soup = BeautifulSoup(html_report, "html.parser")
-        mermaid_elements = soup.find_all("div", class_="mermaid")
-        for mermaid in mermaid_elements:
-            # Extract the SVG content. (Assumes your mermaid container produces SVG markup.)
-            svg_content = mermaid.decode_contents()
-            # Here, you may want to clean the SVG (remove any extra tags).
-            svg_content = re.sub(r"<\/?(html|head|body)[^>]*>", "", svg_content, flags=re.DOTALL|re.IGNORECASE).strip()
-            # Use SVGtoPDF to embed the diagram into the PDF.
-            # The x and y coordinates are provided (current_x is 0; update current_y as needed)
-            SVGtoPDF(doc, svg_content, 0, current_y, { "useCSS": True })
-            # Advance the y-offset by an amount (e.g., 500 points) for the next diagram
-            current_y += 500
-        # Finalize the PDF document
-        doc.end()
-        stream = doc.pipe(BlobStream())
-        # Here you would normally wait for the "finish" event, but for simplicity assume synchronous behavior:
-        pdf_bytes = stream.getvalue()  # Adjust as needed to extract the PDF byte content
-        return pdf_bytes
-    def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
-        html_report = self.generate_report_html(solution_content, metadata)
-        # Check if the report contains mermaid diagrams
-        if 'class="mermaid"' in html_report:
-            try:
-                pdf_bytes = self.generate_pdf_with_mermaid(html_report)
-                return pdf_bytes
-            except Exception as e:
-                logging.error("Error in mermaid PDF generation, falling back to xhtml2pdf: " + str(e))
-        # Otherwise, use your existing pipeline (using xhtml2pdf, Selenium fallback, etc.)
-        import io
         pdf_buffer = io.BytesIO()
-        pisa_status = pisa.CreatePDF(html_report, dest=pdf_buffer)
         if pisa_status.err:
-            return self.fallback_pdf_generation(html_report)
         return pdf_buffer.getvalue()
 def handle_generate_report(query_name: str, user_name: str, final_report: str):
@@ -3041,6 +3128,26 @@ def handle_generate_report(query_name: str, user_name: str, final_report: str):
         logging.error(f"handle_generate_report error: {e}", exc_info=True)
         return f"Error generating report: {str(e)}", None
 def extract_summary_from_crumbs(crumbs_list: list) -> str:
     """
     Given a list of crumb records (each with 'url', 'summary', and 'full_content'),

 class ReportGenerator:
     def __init__(self, render_with_selenium: bool = False):
+        # Flag to determine if we are rendering the final PDF using Selenium
         self.render_with_selenium = render_with_selenium
+    def generate_report_html(self, solution_content: str) -> str:
         # Normalize text and fix dash characters.
         solution_content = unicodedata.normalize('NFKC', solution_content)
         solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
         html_content = html_content.replace("<h2>References</h2>", "<div class='page-break'></div><h2>References</h2>")
         html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>", "<div class='page-break'></div><h2>Surprise-Me Extension Report</h2>")
+        logging.info(f"ReportGenerator: HTML report generated successfully:\n{html_content}")
+        return html_content
+    def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
+        # Generate the full HTML report (including text, focus placeholders, and visuals as iframes)
+        html_report = self.generate_report_html(solution_content)
+        # Add header
         date_str = datetime.now().strftime("%Y-%m-%d")
         header = ""
         if metadata:
+            header = f"<p>Search Query: {metadata.get('Query name', 'N/A')}<br>Author: {metadata.get('User name', 'N/A')} | Date: {metadata.get('Date', date_str)}</p>"
+        soup = BeautifulSoup(html_report, "html.parser")
+        body_tag = soup.body
+        if body_tag:
+            body_tag.insert(0, BeautifulSoup(header, "html.parser"))
+        updated_html = str(soup)
+        # Log the parsed document, including any header inserted above
+        logging.info(f"ReportGenerator: soup report generated:\n{soup}")
+        # Find all mermaid visual iframes (assumed to have class "visual-frame")
+        visual_iframes = soup.find_all("iframe", class_="visual-frame")
+        if visual_iframes:
+            # Set up Selenium with a window size and high DPI for better image resolution
+            import base64, tempfile, time
+            import chromedriver_autoinstaller
+            chromedriver_autoinstaller.install()
+            # (Removed the explicit print statement to keep logs clean)
+            from selenium import webdriver
+            from selenium.webdriver.chrome.options import Options
+            from selenium.webdriver.chrome.service import Service
+            options = Options()
+            options.add_argument("--headless")
+            options.add_argument("--no-sandbox")
+            options.add_argument("--disable-dev-shm-usage")
+            options.add_argument("--window-size=1200,1200")
+            options.add_argument("--force-device-scale-factor=2")
+            service = Service(log_path=os.devnull)
+            driver = webdriver.Chrome(service=service, options=options)
+            for iframe in visual_iframes:
+                # Assume the iframe has its content in srcdoc (as generated in generate_visual_snippet)
+                srcdoc = iframe.get("srcdoc")
+                if srcdoc:
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_file:
+                        tmp_file.write(srcdoc.encode("utf-8"))
+                        tmp_file.flush()
+                        file_url = "file://" + tmp_file.name
+                    driver.get(file_url)
+                    time.sleep(3)  # Allow time for JavaScript (e.g., mermaid) to render
+                    screenshot_png = driver.get_screenshot_as_png()
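+                    # Optional: crop the screenshot to the bounding box returned by img.getbbox().
+                    # NOTE(review): getbbox() bounds the non-zero pixels, so an opaque white margin may not be trimmed - confirm.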
+                    from PIL import Image
+                    from io import BytesIO
+                    img = Image.open(BytesIO(screenshot_png))
+                    cropped_img = img.crop(img.getbbox())
+                    buffer = BytesIO()
+                    cropped_img.save(buffer, format='PNG')
+                    cropped_png = buffer.getvalue()
+                    b64_img = base64.b64encode(cropped_png).decode("utf-8")
+                    new_tag = soup.new_tag("img")
+                    # Add page-break style to avoid forcing a new page after the image when generating the PDF.
+                    new_tag["style"] = "max-width: 500px; display: block; margin: auto; page-break-after: avoid;"
+                    new_tag["src"] = "data:image/png;base64," + b64_img
+                    iframe.replace_with(new_tag)
+            driver.quit()
+        # Instead of converting the entire soup (which may include nested <html> tags), extract only the content within <body>
+        body_tag = soup.find("body")
+        body_content = body_tag.decode_contents() if body_tag else ""
+        # Reassemble a clean, single HTML document with our desired CSS (preserving line breaks)
+        final_html = f"""
 <html>
+  <head>
     <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <style>
+        body {{ font-family: Helvetica, sans-serif; margin: 40px; background: white; }}
+        h1 {{ font-size: 20pt; margin-bottom: 12px; text-align: left; font-weight: bold;}}
+        h2 {{ font-size: 16pt; margin-bottom: 10px; text-align: left; font-weight: bold;}}
+        h3 {{ font-size: 14pt; margin-bottom: 8px; text-align: left; font-weight: bold;}}
+        h4 {{ font-size: 12pt; text-align: left; font-weight: bold;}}
+        p {{ font-size: 11pt; line-height: 1.5; margin-bottom: 10px; white-space: pre-wrap; }}
+        table {{ border: 1px solid black; }}
+        pre, div {{ white-space: pre-wrap; }}
+        ol, ul {{ font-size: 11pt; margin-left: 20px; line-height: 1.5; }}
+        hr {{ border: 1px solid #ccc; margin: 20px 0; }}
+        table {{ border-collapse: collapse; width: 100%; margin-bottom: 10px; }}
+        th, td {{ border: 1px solid #ccc; padding: 8px; text-align: left; }}
+        th {{ background-color: #f2f2f2; }}
+        .page-break {{ page-break-before: always; }}
     </style>
+  </head>
+  <body>
+    {body_content}
+  </body>
 </html>
 """
+        # Build a copy of final_html with <img> tags stripped, used only for the log message below
+        def remove_img_tags(input_string):
+            # Regex pattern matching <img> tags that start with `<img src=` (src must be the first attribute)
+            pattern = r'<img src=.*?>'
+            # Replace all occurrences with an empty string
+            cleaned_string = re.sub(pattern, '', input_string, flags=re.MULTILINE)
+            return cleaned_string
+        cleaned_string = remove_img_tags(final_html)
+        logging.info(f"ReportGenerator: Final HTML for PDF conversion:\n{cleaned_string}")
+        # Insert <br> spacing (and <b> markup) around headings, <div> and <table> tags before PDF conversion
+        final_html = final_html.replace("<h1","<br><br><br><h1").replace("</h1>","</h1><br>")
+        final_html = final_html.replace("<h2","<br><br><b><h2").replace("</h2>","</b></h2><br>")
+        final_html = final_html.replace("<h3","<br><br><h3").replace("</h3>","</b></h3><br>")
+        final_html = final_html.replace("<h4","<br><h4")
+        final_html = final_html.replace("<div","<br><div")
+        final_html = final_html.replace("<table>","<br><table>")
+        # Generate the final PDF from final_html using xhtml2pdf (A4 layout)
         pdf_buffer = io.BytesIO()
+        pisa_status = pisa.CreatePDF(final_html, dest=pdf_buffer,
+                                     link_callback=lambda uri, rel: uri,
+                                     default_css="""
+@page {
+  size: A4;
+  margin: 0.5in;
+}
+body {
+  font-family: Helvetica, sans-serif;
+  background: white;
+  margin: 40px;
+  padding: 0;
+}
+h1 {
+  font-size: 20pt;
+  margin-bottom: 12px;
+  text-align: left;
+  font-weight: bold;
+}
+h2 {
+  font-size: 16pt;
+  margin-bottom: 10px;
+  text-align: left;
+  font-weight: bold;
+}
+h3 {
+  font-size: 14pt;
+  margin-bottom: 8px;
+  text-align: left;
+  font-weight: bold;
+}
+h4 {
+  font-size: 12pt;
+  text-align: left;
+  font-weight: bold;
+}
+table {
+  border: 1px solid black;
+}
+p {
+  font-size: 11pt;
+  line-height: 1.5;
+  margin-bottom: 10px;
+}
+pre, div {
+}
+ol, ul {
+  font-size: 11pt;
+  margin-left: 20px;
+  line-height: 1.5;
+}
+hr {
+  border: 1px solid #ccc;
+  margin: 20px 0;
+}
+table {
+  border-collapse: collapse;
+  width: 100%;
+  margin-bottom: 10px;
+}
+th, td {
+  border: 1px solid #ccc;
+  padding: 8px;
+  text-align: left;
+}
+th {
+  background-color: #f2f2f2;
+}
+.page-break {
+  page-break-before: always;
+}
+""")
         if pisa_status.err:
+            logging.error("Error generating PDF with xhtml2pdf.")
+            return None
         return pdf_buffer.getvalue()
 def handle_generate_report(query_name: str, user_name: str, final_report: str):
         logging.error(f"handle_generate_report error: {e}", exc_info=True)
         return f"Error generating report: {str(e)}", None
+def handle_generate_report(query_name: str, user_name: str, final_report: str):
+    try:
+        report_generator = ReportGenerator(render_with_selenium=False)
+        metadata = {
+            "Query name": query_name,
+            "User name": user_name,
+            "Date": datetime.now().strftime("%Y-%m-%d"),
+            "Time": datetime.now().strftime("%H:%M:%S"),
+        }
+        pdf_bytes = report_generator.generate_report_pdf(solution_content=final_report, metadata=metadata)
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+            tmp_file.write(pdf_bytes)
+            tmp_path = tmp_file.name
+        logging.info(f"handle_generate_report: PDF report generated at {tmp_path}")
+        return "Report generated successfully.", gr.update(value=tmp_path, visible=True)
+    except Exception as e:
+        logging.error(f"handle_generate_report error: {e}", exc_info=True)
+        return f"Error generating report: {str(e)}", None
 def extract_summary_from_crumbs(crumbs_list: list) -> str:
     """
     Given a list of crumb records (each with 'url', 'summary', and 'full_content'),