Guiyom committed on
Commit
39895bb
·
verified ·
1 Parent(s): 0417aa5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -75
app.py CHANGED
@@ -17,8 +17,6 @@ from datetime import datetime
17
  from reportlab.lib.pagesizes import A4
18
  from xhtml2pdf import pisa
19
  import chromedriver_autoinstaller
20
- from selenium import webdriver
21
- from selenium.webdriver.chrome.options import Options
22
 
23
  # Set up logging basic configuration
24
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -32,27 +30,20 @@ MAX_MESSAGE_LENGTH = 1048576
32
  # Helper functions for external APIs and PDF Processing
33
  # =============================================================================
34
 
35
- def fallback_pdf_generation(html_content: str) -> bytes:
36
- """Convert HTML to PDF using a screenshot fallback via Selenium."""
37
- # Automatically install ChromeDriver if not found
38
- chromedriver_autoinstaller.install()
39
-
40
- options = Options()
41
- options.add_argument("--headless")
42
- options.add_argument("--disable-gpu")
43
- options.add_argument("--no-sandbox")
44
- options.add_argument("--window-size=1920,1080") # Desired window size
45
-
46
- # Initialize the WebDriver which now uses the auto-installed ChromeDriver
47
- driver = webdriver.Chrome(options=options)
48
- try:
49
- # Load the HTML content directly using a data URL.
50
- driver.get(f"data:text/html;charset=utf-8,{html_content}")
51
- time.sleep(2) # Allow time for dynamic content to render
52
- screenshot_png = driver.get_screenshot_as_png()
53
- return screenshot_png
54
- finally:
55
- driver.quit()
56
 
57
  def generate_visual_snippet(placeholder_text: str, context: str, initial_query: str, crumbs: str) -> str:
58
  prompt = (f"""
@@ -74,6 +65,7 @@ Keep in mind the:
74
  - White background (#ffffff)
75
  - overall dimension capped at 500px x 500px
76
  - no introduction, conclusions or code fences -> Output the result directly
 
77
 
78
  // Important
79
  - Make the visuals content rich, there's no point having a visual if its content has no real value.
@@ -822,25 +814,22 @@ def validate_visual_html(html: str) -> bool:
822
  return all(checks)
823
 
824
  class ReportGenerator:
825
- def __init__(self):
826
- pass
 
827
 
828
  def generate_report_html(self, solution_content: str, metadata: dict = None) -> str:
829
- # Normalize text and replace problematic dash characters with standard hyphen.
830
  solution_content = unicodedata.normalize('NFKC', solution_content)
831
  solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
832
- # Remove markdown hyperlink syntax: replace [text](link) with just text.
833
  solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
834
- # Convert markdown to HTML using the "extra" and "tables" extensions to support numbering and table syntax.
835
  html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
836
- # Insert explicit breaks for main report sections as needed.
837
  html_content = html_content.replace("<h2>Table of Contents</h2>", "<div class='page-break'></div><h2>Table of Contents</h2>")
838
  html_content = html_content.replace("<h2>Introduction</h2>", "<div class='page-break'></div><h2>Introduction</h2>")
839
  html_content = html_content.replace("<h2>Conclusion</h2>", "<div class='page-break'></div><h2>Conclusion</h2>")
840
  html_content = html_content.replace("<h2>References</h2>", "<div class='page-break'></div><h2>References</h2>")
841
  html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>", "<div class='page-break'></div><h2>Surprise-Me Extension Report</h2>")
842
 
843
- # Build header using metadata if provided.
844
  date_str = datetime.now().strftime("%Y-%m-%d")
845
  header = ""
846
  if metadata:
@@ -848,13 +837,13 @@ class ReportGenerator:
848
  <p>Author: {metadata.get('User name', 'N/A')}</p>
849
  <p>Date: {metadata.get('Date', date_str)}</p>
850
  <hr/>"""
851
- # Build a complete HTML document with CSS and JS (if needed)
852
  full_html = f"""
853
  <html>
854
  <head>
855
  <meta charset="utf-8" />
856
  <style>
857
- body {{ font-family: Helvetica, sans-serif; margin: 40px; }}
858
  h1 {{ font-size: 24pt; margin-bottom: 12px; text-align: left; }}
859
  h2 {{ font-size: 20pt; margin-bottom: 10px; text-align: left; }}
860
  h3 {{ font-size: 18pt; margin-bottom: 8px; text-align: left; }}
@@ -867,7 +856,6 @@ class ReportGenerator:
867
  .page-break {{ page-break-before: always; }}
868
  </style>
869
  <script>
870
- // You may add your JavaScript here if needed.
871
  console.log("Report loaded successfully.");
872
  </script>
873
  </head>
@@ -891,64 +879,69 @@ class ReportGenerator:
891
  options.add_argument("--headless")
892
  options.add_argument("--disable-gpu")
893
  options.add_argument("--no-sandbox")
894
- options.add_argument("--window-size=1920,1080") # Set desired window size
895
 
896
- # Ensure you have ChromeDriver installed and in your PATH.
897
  driver = webdriver.Chrome(options=options)
898
  try:
899
- # Load the HTML content directly using a data URL.
900
  driver.get(f"data:text/html;charset=utf-8,{html_content}")
901
- time.sleep(2) # Allow time for any dynamic content to render
902
  screenshot_png = driver.get_screenshot_as_png()
903
  return screenshot_png
904
  finally:
905
- driver.quit()
 
906
 
907
  def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
908
  # Generate the full HTML report
909
  html_report = self.generate_report_html(solution_content, metadata)
910
 
911
- # Pre-process the HTML: replace dynamic mermaid iframes with static placeholders.
912
- from bs4 import BeautifulSoup
913
- soup = BeautifulSoup(html_report, "html.parser")
914
- for iframe in soup.find_all("iframe"):
915
- srcdoc = iframe.get("srcdoc", "")
916
- if "mermaid" in srcdoc:
917
- # Create a static placeholder div to indicate a mermaid diagram here.
918
- placeholder = soup.new_tag("div", **{"class": "mermaid-placeholder"})
919
- placeholder.string = "Mermaid diagram placeholder (not rendered in PDF)"
920
- iframe.replace_with(placeholder)
921
- # Convert the modified soup back into a string
922
- html_report = str(soup)
923
-
924
- # Inject PDF-specific CSS with support for the placeholder.
925
- html_report = html_report.replace("<style>", """<style>
926
- @media print {
927
- .mermaid-placeholder {
928
- width: 500px;
929
- height: 500px;
930
- border: 1px solid #ccc;
931
- display: flex;
932
- align-items: center;
933
- justify-content: center;
934
- font-size: 12pt;
935
- color: #666;
936
- margin: 10px auto;
937
- }
938
- .visual-container { page-break-inside: avoid; }
939
- svg { max-width: 100% !important; height: auto !important; }
940
- }
941
- """)
942
-
943
- # Convert to PDF using xhtml2pdf (pisa)
 
 
 
 
 
 
 
944
  pdf_buffer = io.BytesIO()
945
  pisa_status = pisa.CreatePDF(html_report, dest=pdf_buffer)
946
-
947
- # If errors are found during PDF conversion, use the fallback method.
948
  if pisa_status.err:
949
- logging.warning("PDF conversion issues detected - attempting image fallback")
950
  return self.fallback_pdf_generation(html_report)
951
-
952
  return pdf_buffer.getvalue()
953
 
954
  def handle_generate_report(query_name: str, user_name: str, final_report: str):
 
17
  from reportlab.lib.pagesizes import A4
18
  from xhtml2pdf import pisa
19
  import chromedriver_autoinstaller
 
 
20
 
21
  # Set up logging basic configuration
22
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
30
  # Helper functions for external APIs and PDF Processing
31
  # =============================================================================
32
 
33
def replace_focus_placeholders(report_html: str, context: str, initial_query: str, crumbs: str) -> str:
    """Replace each ``[[Focus Placeholder N: instructions]]`` marker with a focus box.

    For every marker found in ``report_html`` the instructions text is passed,
    together with ``context``, ``initial_query`` and ``crumbs``, to
    ``generate_focus_snippet``. The returned HTML is wrapped in a styled
    ``<div class="focus-placeholder">`` delimited by start/end HTML comments.

    Args:
        report_html: Report HTML that may contain focus placeholder markers.
        context: Forwarded unchanged to ``generate_focus_snippet``.
        initial_query: Forwarded unchanged to ``generate_focus_snippet``.
        crumbs: Forwarded unchanged to ``generate_focus_snippet``.

    Returns:
        ``report_html`` with every marker replaced. A marker whose snippet
        generation raises is replaced by an ``<!-- ERROR GENERATING FOCUS N -->``
        comment so one failure does not abort the whole report.
    """
    # DOTALL (passed to re.sub below) lets the instructions span several lines.
    pattern = r"\[\[Focus Placeholder (\d+):(.*?)\]\]"

    def placeholder_replacer(match):
        placeholder_num = match.group(1)
        instructions = match.group(2).strip()
        logging.info("Generating focus box %s", placeholder_num)
        try:
            focus_html = generate_focus_snippet(instructions, context, initial_query, crumbs)
        except Exception as e:
            # Deliberate best effort: keep building the report, but log the full
            # traceback (logging.exception) instead of only the message text.
            logging.exception("Focus %s failed: %s", placeholder_num, e)
            return f'<!-- ERROR GENERATING FOCUS {placeholder_num} -->'
        # Wrap the entire focus placeholder in a single div to preserve block-level grouping.
        return f'<!-- Focus {placeholder_num} Start --><div class="focus-placeholder" style="background: #ede8e8; border: 1px solid black; padding: 10px; margin: 10px 0;">\n{focus_html}\n</div><!-- Focus {placeholder_num} End -->'

    return re.sub(pattern, placeholder_replacer, report_html, flags=re.DOTALL)
 
 
 
 
 
 
 
47
 
48
  def generate_visual_snippet(placeholder_text: str, context: str, initial_query: str, crumbs: str) -> str:
49
  prompt = (f"""
 
65
  - White background (#ffffff)
66
  - overall dimension capped at 500px x 500px
67
  - no introduction, conclusions or code fences -> Output the result directly
68
+ - create only the content for the mermaid (do not add comments of #color coding and stuffs inside the mermaid code), it's supposed to be only focused on the mermaid code required to render it
69
 
70
  // Important
71
  - Make the visuals content rich, there's no point having a visual if its content has no real value.
 
814
  return all(checks)
815
 
816
  class ReportGenerator:
817
+ def __init__(self, render_with_selenium: bool = False):
818
+ # Flag to determine if we are rendering the final PDF with Selenium
819
+ self.render_with_selenium = render_with_selenium
820
 
821
  def generate_report_html(self, solution_content: str, metadata: dict = None) -> str:
822
+ # Normalize text and fix dash characters.
823
  solution_content = unicodedata.normalize('NFKC', solution_content)
824
  solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
 
825
  solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
 
826
  html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
 
827
  html_content = html_content.replace("<h2>Table of Contents</h2>", "<div class='page-break'></div><h2>Table of Contents</h2>")
828
  html_content = html_content.replace("<h2>Introduction</h2>", "<div class='page-break'></div><h2>Introduction</h2>")
829
  html_content = html_content.replace("<h2>Conclusion</h2>", "<div class='page-break'></div><h2>Conclusion</h2>")
830
  html_content = html_content.replace("<h2>References</h2>", "<div class='page-break'></div><h2>References</h2>")
831
  html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>", "<div class='page-break'></div><h2>Surprise-Me Extension Report</h2>")
832
 
 
833
  date_str = datetime.now().strftime("%Y-%m-%d")
834
  header = ""
835
  if metadata:
 
837
  <p>Author: {metadata.get('User name', 'N/A')}</p>
838
  <p>Date: {metadata.get('Date', date_str)}</p>
839
  <hr/>"""
840
+ # Force a white background for the entire page (overriding any light grey)
841
  full_html = f"""
842
  <html>
843
  <head>
844
  <meta charset="utf-8" />
845
  <style>
846
+ body {{ font-family: Helvetica, sans-serif; margin: 40px; background: white; }}
847
  h1 {{ font-size: 24pt; margin-bottom: 12px; text-align: left; }}
848
  h2 {{ font-size: 20pt; margin-bottom: 10px; text-align: left; }}
849
  h3 {{ font-size: 18pt; margin-bottom: 8px; text-align: left; }}
 
856
  .page-break {{ page-break-before: always; }}
857
  </style>
858
  <script>
 
859
  console.log("Report loaded successfully.");
860
  </script>
861
  </head>
 
879
  options.add_argument("--headless")
880
  options.add_argument("--disable-gpu")
881
  options.add_argument("--no-sandbox")
882
+ options.add_argument("--window-size=1920,1080")
883
 
 
884
  driver = webdriver.Chrome(options=options)
885
  try:
 
886
  driver.get(f"data:text/html;charset=utf-8,{html_content}")
887
+ time.sleep(2)
888
  screenshot_png = driver.get_screenshot_as_png()
889
  return screenshot_png
890
  finally:
891
+ driver.quit()
892
+
893
 
894
  def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
895
  # Generate the full HTML report
896
  html_report = self.generate_report_html(solution_content, metadata)
897
 
898
+ # Optionally pre-process the HTML only if we are converting via pisa.
899
+ if not self.render_with_selenium:
900
+ from bs4 import BeautifulSoup
901
+ soup = BeautifulSoup(html_report, "html.parser")
902
+ # Replace mermaid iframes with placeholders only if not using Selenium (since Selenium can render JS)
903
+ for iframe in soup.find_all("iframe"):
904
+ srcdoc = iframe.get("srcdoc", "")
905
+ if "mermaid" in srcdoc:
906
+ placeholder = soup.new_tag("div", **{"class": "mermaid-placeholder"})
907
+ placeholder.string = "Mermaid diagram placeholder (not rendered in PDF)"
908
+ iframe.replace_with(placeholder)
909
+ html_report = str(soup)
910
+ # Inject CSS for mermaid placeholders
911
+ html_report = html_report.replace("<style>", """<style>
912
+ @media print {
913
+ .mermaid-placeholder {
914
+ width: 500px;
915
+ height: 500px;
916
+ border: 1px solid #ccc;
917
+ display: flex;
918
+ align-items: center;
919
+ justify-content: center;
920
+ font-size: 12pt;
921
+ color: #666;
922
+ margin: 10px auto;
923
+ }
924
+ .visual-container { page-break-inside: avoid; }
925
+ svg { max-width: 100% !important; height: auto !important; }
926
+ }
927
+ """)
928
+ # Wrap the alignment assessment (if present) in a div to control overflow.
929
+ html_report = html_report.replace(
930
+ "<p><b>Report alignment assessment:</b>",
931
+ "<div style='max-width:100%; word-wrap: break-word;'><p><b>Report alignment assessment:</b>"
932
+ )
933
+ # Ensure closing tag for the added div.
934
+ html_report = html_report.replace("</body>", "</div></body>")
935
+
936
+ # Convert HTML to PDF using xhtml2pdf.
937
+ import io
938
  pdf_buffer = io.BytesIO()
939
  pisa_status = pisa.CreatePDF(html_report, dest=pdf_buffer)
940
+
 
941
  if pisa_status.err:
942
+ logging.warning("PDF conversion issues detected - attempting Selenium fallback")
943
  return self.fallback_pdf_generation(html_report)
944
+
945
  return pdf_buffer.getvalue()
946
 
947
  def handle_generate_report(query_name: str, user_name: str, final_report: str):