Spaces:

10gen
/

deepsearchitv2

Runtime error

App Files Files Community

Guiyom committed on Feb 16, 2025

Commit

39895bb

verified ·

1 Parent(s): 0417aa5

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -75

app.py CHANGED Viewed

@@ -17,8 +17,6 @@ from datetime import datetime
 from reportlab.lib.pagesizes import A4
 from xhtml2pdf import pisa
 import chromedriver_autoinstaller
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
 # Set up logging basic configuration
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -32,27 +30,20 @@ MAX_MESSAGE_LENGTH = 1048576
 # Helper functions for external APIs and PDF Processing
 # =============================================================================
-def fallback_pdf_generation(html_content: str) -> bytes:
-    """Convert HTML to PDF using a screenshot fallback via Selenium."""
-    # Automatically install ChromeDriver if not found
-    chromedriver_autoinstaller.install()
-    options = Options()
-    options.add_argument("--headless")
-    options.add_argument("--disable-gpu")
-    options.add_argument("--no-sandbox")
-    options.add_argument("--window-size=1920,1080")  # Desired window size
-    # Initialize the WebDriver which now uses the auto-installed ChromeDriver
-    driver = webdriver.Chrome(options=options)
-    try:
-        # Load the HTML content directly using a data URL.
-        driver.get(f"data:text/html;charset=utf-8,{html_content}")
-        time.sleep(2)  # Allow time for dynamic content to render
-        screenshot_png = driver.get_screenshot_as_png()
-        return screenshot_png
-    finally:
-        driver.quit()
 def generate_visual_snippet(placeholder_text: str, context: str, initial_query: str, crumbs: str) -> str:
     prompt = (f"""
@@ -74,6 +65,7 @@ Keep in mind the:
 - White background (#ffffff)
 - overall dimension capped at 500px x 500px
 - no introduction, conclusions or code fences -> Output the result directly
 // Important
 - Make the visuals content rich, there's no point having a visual if its content has no real value.
@@ -822,25 +814,22 @@ def validate_visual_html(html: str) -> bool:
     return all(checks)
 class ReportGenerator:
-    def __init__(self):
-        pass
     def generate_report_html(self, solution_content: str, metadata: dict = None) -> str:
-        # Normalize text and replace problematic dash characters with standard hyphen.
         solution_content = unicodedata.normalize('NFKC', solution_content)
         solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
-        # Remove markdown hyperlink syntax: replace [text](link) with just text.
         solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
-        # Convert markdown to HTML using the "extra" and "tables" extensions to support numbering and table syntax.
         html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
-        # Insert explicit breaks for main report sections as needed.
         html_content = html_content.replace("<h2>Table of Contents</h2>", "<div class='page-break'></div><h2>Table of Contents</h2>")
         html_content = html_content.replace("<h2>Introduction</h2>", "<div class='page-break'></div><h2>Introduction</h2>")
         html_content = html_content.replace("<h2>Conclusion</h2>", "<div class='page-break'></div><h2>Conclusion</h2>")
         html_content = html_content.replace("<h2>References</h2>", "<div class='page-break'></div><h2>References</h2>")
         html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>", "<div class='page-break'></div><h2>Surprise-Me Extension Report</h2>")
-        # Build header using metadata if provided.
         date_str = datetime.now().strftime("%Y-%m-%d")
         header = ""
         if metadata:
@@ -848,13 +837,13 @@ class ReportGenerator:
 <p>Author: {metadata.get('User name', 'N/A')}</p>
 <p>Date: {metadata.get('Date', date_str)}</p>
 <hr/>"""
-        # Build a complete HTML document with CSS and JS (if needed)
         full_html = f"""
 <html>
 <head>
     <meta charset="utf-8" />
     <style>
-        body {{ font-family: Helvetica, sans-serif; margin: 40px; }}
         h1 {{ font-size: 24pt; margin-bottom: 12px; text-align: left; }}
         h2 {{ font-size: 20pt; margin-bottom: 10px; text-align: left; }}
         h3 {{ font-size: 18pt; margin-bottom: 8px; text-align: left; }}
@@ -867,7 +856,6 @@ class ReportGenerator:
         .page-break {{ page-break-before: always; }}
     </style>
     <script>
-        // You may add your JavaScript here if needed.
         console.log("Report loaded successfully.");
     </script>
 </head>
@@ -891,64 +879,69 @@ class ReportGenerator:
         options.add_argument("--headless")
         options.add_argument("--disable-gpu")
         options.add_argument("--no-sandbox")
-        options.add_argument("--window-size=1920,1080")  # Set desired window size
-        # Ensure you have ChromeDriver installed and in your PATH.
         driver = webdriver.Chrome(options=options)
         try:
-            # Load the HTML content directly using a data URL.
             driver.get(f"data:text/html;charset=utf-8,{html_content}")
-            time.sleep(2)  # Allow time for any dynamic content to render
             screenshot_png = driver.get_screenshot_as_png()
             return screenshot_png
         finally:
-            driver.quit()
     def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
         # Generate the full HTML report
         html_report = self.generate_report_html(solution_content, metadata)
-        # Pre-process the HTML: replace dynamic mermaid iframes with static placeholders.
-        from bs4 import BeautifulSoup
-        soup = BeautifulSoup(html_report, "html.parser")
-        for iframe in soup.find_all("iframe"):
-            srcdoc = iframe.get("srcdoc", "")
-            if "mermaid" in srcdoc:
-                # Create a static placeholder div to indicate a mermaid diagram here.
-                placeholder = soup.new_tag("div", **{"class": "mermaid-placeholder"})
-                placeholder.string = "Mermaid diagram placeholder (not rendered in PDF)"
-                iframe.replace_with(placeholder)
-        # Convert the modified soup back into a string
-        html_report = str(soup)
-        # Inject PDF-specific CSS with support for the placeholder.
-        html_report = html_report.replace("<style>", """<style>
-        @media print {
-             .mermaid-placeholder {
-                width: 500px;
-                height: 500px;
-                border: 1px solid #ccc;
-                display: flex;
-                align-items: center;
-                justify-content: center;
-                font-size: 12pt;
-                color: #666;
-                margin: 10px auto;
-             }
-             .visual-container { page-break-inside: avoid; }
-             svg { max-width: 100% !important; height: auto !important; }
-        }
-        """)
-        # Convert to PDF using xhtml2pdf (pisa)
         pdf_buffer = io.BytesIO()
         pisa_status = pisa.CreatePDF(html_report, dest=pdf_buffer)
-        # If errors are found during PDF conversion, use the fallback method.
         if pisa_status.err:
-             logging.warning("PDF conversion issues detected - attempting image fallback")
              return self.fallback_pdf_generation(html_report)
         return pdf_buffer.getvalue()
 def handle_generate_report(query_name: str, user_name: str, final_report: str):

 from reportlab.lib.pagesizes import A4
 from xhtml2pdf import pisa
 import chromedriver_autoinstaller
 # Set up logging basic configuration
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 # Helper functions for external APIs and PDF Processing
 # =============================================================================
+def replace_focus_placeholders(report_html: str, context: str, initial_query: str, crumbs: str) -> str:
+    """Replace each ``[[Focus Placeholder N: instructions]]`` marker with a generated focus box.
+
+    Args:
+        report_html: Report text that may contain focus placeholder markers.
+        context: Forwarded unchanged to ``generate_focus_snippet``.
+        initial_query: Forwarded unchanged to ``generate_focus_snippet``.
+        crumbs: Forwarded unchanged to ``generate_focus_snippet``.
+
+    Returns:
+        ``report_html`` with every marker replaced either by a styled ``<div>``
+        wrapping the generated snippet, or by an HTML error comment when the
+        snippet generation raises.
+    """
+    # Group 1 captures the placeholder number, group 2 the free-text instructions.
+    pattern = r"\[\[Focus Placeholder (\d+):(.*?)\]\]"
+    def placeholder_replacer(match):
+        placeholder_num = match.group(1)
+        instructions = match.group(2).strip()
+        logging.info(f"Generating focus box {placeholder_num}")
+        try:
+            focus_html = generate_focus_snippet(instructions, context, initial_query, crumbs)
+            # Wrap the entire focus placeholder in a single div to preserve block-level grouping.
+            return f'<!-- Focus {placeholder_num} Start --><div class="focus-placeholder" style="background: #ede8e8; border: 1px solid black; padding: 10px; margin: 10px 0;">\n{focus_html}\n</div><!-- Focus {placeholder_num} End -->'
+        except Exception as e:
+            # Best effort: a failed snippet is logged and replaced by an HTML comment
+            # so one bad placeholder does not abort the whole substitution.
+            logging.error(f"Focus {placeholder_num} failed: {str(e)}")
+            return f'<!-- ERROR GENERATING FOCUS {placeholder_num} -->'
+    # DOTALL lets the instructions part of a marker span several lines.
+    return re.sub(pattern, placeholder_replacer, report_html, flags=re.DOTALL)
 def generate_visual_snippet(placeholder_text: str, context: str, initial_query: str, crumbs: str) -> str:
     prompt = (f"""
 - White background (#ffffff)
 - overall dimension capped at 500px x 500px
 - no introduction, conclusions or code fences -> Output the result directly
+- create only the content for the mermaid (do not add comments of #color coding and stuffs inside the mermaid code), it's supposed to be only focused on the mermaid code required to render it
 // Important
 - Make the visuals content rich, there's no point having a visual if its content has no real value.
     return all(checks)
 class ReportGenerator:
+    def __init__(self, render_with_selenium: bool = False):
+        # Flag to determine if we are rendering the final PDF with Selenium
+        self.render_with_selenium = render_with_selenium
     def generate_report_html(self, solution_content: str, metadata: dict = None) -> str:
+        # Normalize text and fix dash characters.
         solution_content = unicodedata.normalize('NFKC', solution_content)
         solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
         solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
         html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
         html_content = html_content.replace("<h2>Table of Contents</h2>", "<div class='page-break'></div><h2>Table of Contents</h2>")
         html_content = html_content.replace("<h2>Introduction</h2>", "<div class='page-break'></div><h2>Introduction</h2>")
         html_content = html_content.replace("<h2>Conclusion</h2>", "<div class='page-break'></div><h2>Conclusion</h2>")
         html_content = html_content.replace("<h2>References</h2>", "<div class='page-break'></div><h2>References</h2>")
         html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>", "<div class='page-break'></div><h2>Surprise-Me Extension Report</h2>")
         date_str = datetime.now().strftime("%Y-%m-%d")
         header = ""
         if metadata:
 <p>Author: {metadata.get('User name', 'N/A')}</p>
 <p>Date: {metadata.get('Date', date_str)}</p>
 <hr/>"""
+        # Force a white background for the entire page (overriding any light grey)
         full_html = f"""
 <html>
 <head>
     <meta charset="utf-8" />
     <style>
+        body {{ font-family: Helvetica, sans-serif; margin: 40px; background: white; }}
         h1 {{ font-size: 24pt; margin-bottom: 12px; text-align: left; }}
         h2 {{ font-size: 20pt; margin-bottom: 10px; text-align: left; }}
         h3 {{ font-size: 18pt; margin-bottom: 8px; text-align: left; }}
         .page-break {{ page-break-before: always; }}
     </style>
     <script>
         console.log("Report loaded successfully.");
     </script>
 </head>
         options.add_argument("--headless")
         options.add_argument("--disable-gpu")
         options.add_argument("--no-sandbox")
+        options.add_argument("--window-size=1920,1080")
         driver = webdriver.Chrome(options=options)
         try:
             driver.get(f"data:text/html;charset=utf-8,{html_content}")
+            time.sleep(2)
             screenshot_png = driver.get_screenshot_as_png()
             return screenshot_png
         finally:
+            driver.quit()
     def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
+        """Build the HTML report and convert it to PDF bytes with xhtml2pdf (pisa).
+
+        Args:
+            solution_content: Report source text, passed to ``generate_report_html``.
+            metadata: Optional header metadata, passed to ``generate_report_html``.
+
+        Returns:
+            The bytes written by ``pisa.CreatePDF``; when pisa reports an error,
+            whatever ``self.fallback_pdf_generation`` returns instead.
+            NOTE(review): the visible body of that fallback returns a PNG
+            screenshot, not a PDF - confirm callers can handle that.
+        """
         # Generate the full HTML report
         html_report = self.generate_report_html(solution_content, metadata)
+        # NOTE(review): render_with_selenium only skips the mermaid pre-processing below;
+        # the conversion further down still always goes through pisa - confirm this is intended.
+        # Optionally pre-process the HTML only if we are converting via pisa.
+        if not self.render_with_selenium:
+            from bs4 import BeautifulSoup
+            soup = BeautifulSoup(html_report, "html.parser")
+            # Replace mermaid iframes with placeholders only if not using Selenium (since Selenium can render JS)
+            for iframe in soup.find_all("iframe"):
+                srcdoc = iframe.get("srcdoc", "")
+                if "mermaid" in srcdoc:
+                    placeholder = soup.new_tag("div", **{"class": "mermaid-placeholder"})
+                    placeholder.string = "Mermaid diagram placeholder (not rendered in PDF)"
+                    iframe.replace_with(placeholder)
+            html_report = str(soup)
+            # Inject CSS for mermaid placeholders
+            html_report = html_report.replace("<style>", """<style>
+            @media print {
+                 .mermaid-placeholder {
+                    width: 500px;
+                    height: 500px;
+                    border: 1px solid #ccc;
+                    display: flex;
+                    align-items: center;
+                    justify-content: center;
+                    font-size: 12pt;
+                    color: #666;
+                    margin: 10px auto;
+                 }
+                 .visual-container { page-break-inside: avoid; }
+                 svg { max-width: 100% !important; height: auto !important; }
+            }
+            """)
+        # Wrap the alignment assessment (if present) in a div to control overflow.
+        html_report = html_report.replace(
+            "<p><b>Report alignment assessment:</b>",
+            "<div style='max-width:100%; word-wrap: break-word;'><p><b>Report alignment assessment:</b>"
+        )
+        # NOTE(review): the replacement below runs unconditionally, so a report without the
+        # alignment-assessment paragraph gets a closing </div> that has no matching opening tag.
+        # Ensure closing tag for the added div.
+        html_report = html_report.replace("</body>", "</div></body>")
+        # Convert HTML to PDF using xhtml2pdf.
+        import io
         pdf_buffer = io.BytesIO()
         pisa_status = pisa.CreatePDF(html_report, dest=pdf_buffer)
+        # pisa exposes a non-zero err on conversion problems; hand the same HTML to the fallback.
         if pisa_status.err:
+             logging.warning("PDF conversion issues detected - attempting Selenium fallback")
              return self.fallback_pdf_generation(html_report)
         return pdf_buffer.getvalue()
 def handle_generate_report(query_name: str, user_name: str, final_report: str):