Guiyom committed on
Commit
6986d5c
·
verified ·
1 Parent(s): 9a02e15

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +210 -103
app.py CHANGED
@@ -2892,10 +2892,10 @@ def validate_visual_html(html: str) -> bool:
2892
 
2893
  class ReportGenerator:
2894
  def __init__(self, render_with_selenium: bool = False):
2895
- # Flag to determine if we are rendering the final PDF with Selenium
2896
  self.render_with_selenium = render_with_selenium
2897
 
2898
- def generate_report_html(self, solution_content: str, metadata: dict = None) -> str:
2899
  # Normalize text and fix dash characters.
2900
  solution_content = unicodedata.normalize('NFKC', solution_content)
2901
  solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
@@ -2907,118 +2907,205 @@ class ReportGenerator:
2907
  html_content = html_content.replace("<h2>References</h2>", "<div class='page-break'></div><h2>References</h2>")
2908
  html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>", "<div class='page-break'></div><h2>Surprise-Me Extension Report</h2>")
2909
 
 
 
 
 
 
 
 
 
2910
  date_str = datetime.now().strftime("%Y-%m-%d")
2911
  header = ""
2912
  if metadata:
2913
- header = f"""<h1>Search Query: {metadata.get('Query name', 'N/A')}</h1>
2914
- <p>Author: {metadata.get('User name', 'N/A')}</p>
2915
- <p>Date: {metadata.get('Date', date_str)}</p>
2916
- <hr/>"""
2917
- # Force a white background for the entire page (overriding any light grey)
2918
- full_html = f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2919
  <html>
2920
- <head>
2921
  <meta charset="utf-8" />
 
2922
  <style>
2923
- body {{ font-family: Helvetica, sans-serif; margin: 40px; background: white; }}
2924
- h1 {{ font-size: 24pt; margin-bottom: 12px; text-align: left; }}
2925
- h2 {{ font-size: 20pt; margin-bottom: 10px; text-align: left; }}
2926
- h3 {{ font-size: 18pt; margin-bottom: 8px; text-align: left; }}
2927
- /* Force paragraphs to have a white background */
2928
- p {{ font-size: 11pt; line-height: 1.5; margin-bottom: 10px; background-color: white !important; }}
2929
- ol, ul {{ font-size: 11pt; margin-left: 20px; line-height: 1.5; }}
2930
- h2 + ol {{ line-height: 1 !important; }}
2931
- hr {{ border: 1px solid #ccc; margin: 20px 0; }}
2932
- table {{ border-collapse: collapse; width: 100%; margin-bottom: 10px; }}
2933
- th, td {{ border: 1px solid #ccc; padding: 8px; text-align: left; }}
2934
- th {{ background-color: #f2f2f2; }}
2935
- .page-break {{ page-break-before: always; }}
 
2936
  </style>
2937
- <script>
2938
- console.log("Report loaded successfully.");
2939
- </script>
2940
- </head>
2941
- <body>
2942
- {header}
2943
- {html_content}
2944
- </body>
2945
  </html>
2946
  """
2947
- logging.info("ReportGenerator: HTML report generated successfully.")
2948
- return full_html
2949
-
2950
- def fallback_pdf_generation(self, html_content: str) -> bytes:
2951
- from selenium import webdriver
2952
- from selenium.webdriver.chrome.options import Options
2953
- import time
2954
- import io
2955
- options = Options()
2956
- options.add_argument("--headless")
2957
- options.add_argument("--disable-gpu")
2958
- options.add_argument("--no-sandbox")
2959
- options.add_argument("--window-size=1920,1080")
2960
- driver = webdriver.Chrome(options=options)
2961
- try:
2962
- driver.get(f"data:text/html;charset=utf-8,{html_content}")
2963
- time.sleep(10)
2964
- screenshot_png = driver.get_screenshot_as_png()
2965
- return screenshot_png
2966
- finally:
2967
- driver.quit()
2968
-
2969
-
2970
- def generate_pdf_with_mermaid(self, html_report: str) -> bytes:
2971
- """
2972
- This method processes the given HTML report, extracts all mermaid diagrams
2973
- (assumed to be rendered as SVG inside a <div class="mermaid">), and uses
2974
- svg-to-PDFKit to embed each diagram into a PDF document.
2975
- """
2976
- from pdfkit import PDFDocument # hypothetical module import
2977
- from blobstream import BlobStream # hypothetical; adapt as needed
2978
- from svg2pdfkit import SVGtoPDF # pseudo-import; adjust to your environment
2979
-
2980
- # Create a new PDF document using PDFKit's PDFDocument (or equivalent)
2981
- doc = PDFDocument({ "compress": True })
2982
- # Keep track of Y-offset for inserting each diagram
2983
- current_y = doc.y if hasattr(doc, "y") else 0
2984
-
2985
- # Parse the HTML to extract mermaid diagrams
2986
- soup = BeautifulSoup(html_report, "html.parser")
2987
- mermaid_elements = soup.find_all("div", class_="mermaid")
2988
- for mermaid in mermaid_elements:
2989
- # Extract the SVG content. (Assumes your mermaid container produces SVG markup.)
2990
- svg_content = mermaid.decode_contents()
2991
- # Here, you may want to clean the SVG (remove any extra tags).
2992
- svg_content = re.sub(r"<\/?(html|head|body)[^>]*>", "", svg_content, flags=re.DOTALL|re.IGNORECASE).strip()
2993
-
2994
- # Use SVGtoPDF to embed the diagram into the PDF.
2995
- # The x and y coordinates are provided (current_x is 0; update current_y as needed)
2996
- SVGtoPDF(doc, svg_content, 0, current_y, { "useCSS": True })
2997
- # Advance the y-offset by an amount (e.g., 500 points) for the next diagram
2998
- current_y += 500
2999
-
3000
- # Finalize the PDF document
3001
- doc.end()
3002
- stream = doc.pipe(BlobStream())
3003
- # Here you would normally wait for the "finish" event, but for simplicity assume synchronous behavior:
3004
- pdf_bytes = stream.getvalue() # Adjust as needed to extract the PDF byte content
3005
- return pdf_bytes
3006
-
3007
- def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
3008
- html_report = self.generate_report_html(solution_content, metadata)
3009
- # Check if the report contains mermaid diagrams
3010
- if 'class="mermaid"' in html_report:
3011
- try:
3012
- pdf_bytes = self.generate_pdf_with_mermaid(html_report)
3013
- return pdf_bytes
3014
- except Exception as e:
3015
- logging.error("Error in mermaid PDF generation, falling back to xhtml2pdf: " + str(e))
3016
- # Otherwise, use your existing pipeline (using xhtml2pdf, Selenium fallback, etc.)
3017
- import io
3018
  pdf_buffer = io.BytesIO()
3019
- pisa_status = pisa.CreatePDF(html_report, dest=pdf_buffer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3020
  if pisa_status.err:
3021
- return self.fallback_pdf_generation(html_report)
 
3022
  return pdf_buffer.getvalue()
3023
 
3024
  def handle_generate_report(query_name: str, user_name: str, final_report: str):
@@ -3041,6 +3128,26 @@ def handle_generate_report(query_name: str, user_name: str, final_report: str):
3041
  logging.error(f"handle_generate_report error: {e}", exc_info=True)
3042
  return f"Error generating report: {str(e)}", None
3043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3044
  def extract_summary_from_crumbs(crumbs_list: list) -> str:
3045
  """
3046
  Given a list of crumb records (each with 'url', 'summary', and 'full_content'),
 
2892
 
2893
  class ReportGenerator:
2894
  def __init__(self, render_with_selenium: bool = False):
2895
+ # Flag to determine if we are rendering the final PDF using Selenium
2896
  self.render_with_selenium = render_with_selenium
2897
 
2898
+ def generate_report_html(self, solution_content: str) -> str:
2899
  # Normalize text and fix dash characters.
2900
  solution_content = unicodedata.normalize('NFKC', solution_content)
2901
  solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
 
2907
  html_content = html_content.replace("<h2>References</h2>", "<div class='page-break'></div><h2>References</h2>")
2908
  html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>", "<div class='page-break'></div><h2>Surprise-Me Extension Report</h2>")
2909
 
2910
+ logging.info(f"ReportGenerator: HTML report generated successfully:\n{html_content}")
2911
+ return html_content
2912
+
2913
+ def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
2914
+ # Generate the full HTML report (including text, focus placeholders, and visuals as iframes)
2915
+ html_report = self.generate_report_html(solution_content)
2916
+
2917
+ # Add header
2918
  date_str = datetime.now().strftime("%Y-%m-%d")
2919
  header = ""
2920
  if metadata:
2921
+ header = f"<p>Search Query: {metadata.get('Query name', 'N/A')}<br>Author: {metadata.get('User name', 'N/A')} | Date: {metadata.get('Date', date_str)}</p>"
2922
+ soup = BeautifulSoup(html_report, "html.parser")
2923
+ body_tag = soup.body
2924
+ if body_tag:
2925
+ body_tag.insert(0, BeautifulSoup(header, "html.parser"))
2926
+ updated_html = str(soup)
2927
+
2928
+ # Parse the HTML
2929
+ logging.info(f"ReportGenerator: soup report generated:\n{soup}")
2930
+
2931
+ # Find all mermaid visual iframes (assumed to have class "visual-frame")
2932
+ visual_iframes = soup.find_all("iframe", class_="visual-frame")
2933
+
2934
+ if visual_iframes:
2935
+ # Set up Selenium with a window size and high DPI for better image resolution
2936
+ import base64, tempfile, time
2937
+ import chromedriver_autoinstaller
2938
+ chromedriver_autoinstaller.install()
2939
+ # (Removed the explicit print statement to keep logs clean)
2940
+ from selenium import webdriver
2941
+ from selenium.webdriver.chrome.options import Options
2942
+ from selenium.webdriver.chrome.service import Service
2943
+ options = Options()
2944
+ options.add_argument("--headless")
2945
+ options.add_argument("--no-sandbox")
2946
+ options.add_argument("--disable-dev-shm-usage")
2947
+ options.add_argument("--window-size=1200,1200")
2948
+ options.add_argument("--force-device-scale-factor=2")
2949
+ service = Service(log_path=os.devnull)
2950
+ driver = webdriver.Chrome(service=service, options=options)
2951
+
2952
+ for iframe in visual_iframes:
2953
+ # Assume the iframe has its content in srcdoc (as generated in generate_visual_snippet)
2954
+ srcdoc = iframe.get("srcdoc")
2955
+ if srcdoc:
2956
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_file:
2957
+ tmp_file.write(srcdoc.encode("utf-8"))
2958
+ tmp_file.flush()
2959
+ file_url = "file://" + tmp_file.name
2960
+
2961
+ driver.get(file_url)
2962
+ time.sleep(3) # Allow time for JavaScript (e.g., mermaid) to render
2963
+ screenshot_png = driver.get_screenshot_as_png()
2964
+
2965
+ # Optional: Crop the screenshot to remove extra whitespace:
2966
+ from PIL import Image
2967
+ from io import BytesIO
2968
+ img = Image.open(BytesIO(screenshot_png))
2969
+ cropped_img = img.crop(img.getbbox())
2970
+ buffer = BytesIO()
2971
+ cropped_img.save(buffer, format='PNG')
2972
+ cropped_png = buffer.getvalue()
2973
+
2974
+ b64_img = base64.b64encode(cropped_png).decode("utf-8")
2975
+ new_tag = soup.new_tag("img")
2976
+ # Add page-break style to avoid forcing a new page after the image when generating the PDF.
2977
+ new_tag["style"] = "max-width: 500px; display: block; margin: auto; page-break-after: avoid;"
2978
+ new_tag["src"] = "data:image/png;base64," + b64_img
2979
+ iframe.replace_with(new_tag)
2980
+ driver.quit()
2981
+
2982
+ # Instead of converting the entire soup (which may include nested <html> tags), extract only the content within <body>
2983
+ body_tag = soup.find("body")
2984
+ body_content = body_tag.decode_contents() if body_tag else ""
2985
+
2986
+ # Reassemble a clean, single HTML document with our desired CSS (preserving line breaks)
2987
+ final_html = f"""
2988
  <html>
2989
+ <head>
2990
  <meta charset="utf-8" />
2991
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
2992
  <style>
2993
+ body {{ font-family: Helvetica, sans-serif; margin: 40px; background: white; }}
2994
+ h1 {{ font-size: 20pt; margin-bottom: 12px; text-align: left; font-weight: bold;}}
2995
+ h2 {{ font-size: 16pt; margin-bottom: 10px; text-align: left; font-weight: bold;}}
2996
+ h3 {{ font-size: 14pt; margin-bottom: 8px; text-align: left; font-weight: bold;}}
2997
+ h4 {{ font-size: 12pt; text-align: left; font-weight: bold;}}
2998
+ p {{ font-size: 11pt; line-height: 1.5; margin-bottom: 10px; white-space: pre-wrap; }}
2999
+ table {{ border: 1px solid black; }}
3000
+ pre, div {{ white-space: pre-wrap; }}
3001
+ ol, ul {{ font-size: 11pt; margin-left: 20px; line-height: 1.5; }}
3002
+ hr {{ border: 1px solid #ccc; margin: 20px 0; }}
3003
+ table {{ border-collapse: collapse; width: 100%; margin-bottom: 10px; }}
3004
+ th, td {{ border: 1px solid #ccc; padding: 8px; text-align: left; }}
3005
+ th {{ background-color: #f2f2f2; }}
3006
+ .page-break {{ page-break-before: always; }}
3007
  </style>
3008
+ </head>
3009
+ <body>
3010
+ {body_content}
3011
+ </body>
 
 
 
 
3012
  </html>
3013
  """
3014
+ # Preprocessing for log display
3015
+ def remove_img_tags(input_string):
3016
+ # Regex pattern to match <img> tags with any src attribute
3017
+ pattern = r'<img src=.*?>'
3018
+ # Replace all occurrences with an empty string
3019
+ cleaned_string = re.sub(pattern, '', input_string, flags=re.MULTILINE)
3020
+ return cleaned_string
3021
+ cleaned_string = remove_img_tags(final_html)
3022
+ logging.info(f"ReportGenerator: Final HTML for PDF conversion:\n{cleaned_string}")
3023
+
3024
+ # Add <br> spacing around headings, divs and tables before PDF conversion
3025
+ final_html = final_html.replace("<h1","<br><br><br><h1").replace("</h1>","</h1><br>")
3026
+ final_html = final_html.replace("<h2","<br><br><b><h2").replace("</h2>","</b></h2><br>")
3027
+ final_html = final_html.replace("<h3","<br><br><h3").replace("</h3>","</b></h3><br>")
3028
+ final_html = final_html.replace("<h4","<br><h4")
3029
+ final_html = final_html.replace("<div","<br><div")
3030
+ final_html = final_html.replace("<table>","<br><table>")
3031
+
3032
+ # Generate the final PDF from final_html using xhtml2pdf (A4 layout)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3033
  pdf_buffer = io.BytesIO()
3034
+ pisa_status = pisa.CreatePDF(final_html, dest=pdf_buffer,
3035
+ link_callback=lambda uri, rel: uri,
3036
+ default_css="""
3037
+ @page {
3038
+ size: A4;
3039
+ margin: 0.5in;
3040
+ }
3041
+ body {
3042
+ font-family: Helvetica, sans-serif;
3043
+ background: white;
3044
+ margin: 40px;
3045
+ padding: 0;
3046
+ }
3047
+ h1 {
3048
+ font-size: 20pt;
3049
+ margin-bottom: 12px;
3050
+ text-align: left;
3051
+ font-weight: bold;
3052
+ }
3053
+ h2 {
3054
+ font-size: 16pt;
3055
+ margin-bottom: 10px;
3056
+ text-align: left;
3057
+ font-weight: bold;
3058
+ }
3059
+ h3 {
3060
+ font-size: 14pt;
3061
+ margin-bottom: 8px;
3062
+ text-align: left;
3063
+ font-weight: bold;
3064
+ }
3065
+ h4 {
3066
+ font-size: 12pt;
3067
+ text-align: left;
3068
+ font-weight: bold;
3069
+ }
3070
+ table {
3071
+ border: 1px solid black;
3072
+ }
3073
+ p {
3074
+ font-size: 11pt;
3075
+ line-height: 1.5;
3076
+ margin-bottom: 10px;
3077
+ }
3078
+ pre, div {
3079
+ }
3080
+ ol, ul {
3081
+ font-size: 11pt;
3082
+ margin-left: 20px;
3083
+ line-height: 1.5;
3084
+ }
3085
+ hr {
3086
+ border: 1px solid #ccc;
3087
+ margin: 20px 0;
3088
+ }
3089
+ table {
3090
+ border-collapse: collapse;
3091
+ width: 100%;
3092
+ margin-bottom: 10px;
3093
+ }
3094
+ th, td {
3095
+ border: 1px solid #ccc;
3096
+ padding: 8px;
3097
+ text-align: left;
3098
+ }
3099
+ th {
3100
+ background-color: #f2f2f2;
3101
+ }
3102
+ .page-break {
3103
+ page-break-before: always;
3104
+ }
3105
+ """)
3106
  if pisa_status.err:
3107
+ logging.error("Error generating PDF with xhtml2pdf.")
3108
+ return None
3109
  return pdf_buffer.getvalue()
3110
 
3111
  def handle_generate_report(query_name: str, user_name: str, final_report: str):
 
3128
  logging.error(f"handle_generate_report error: {e}", exc_info=True)
3129
  return f"Error generating report: {str(e)}", None
3130
 
3131
+ def handle_generate_report(query_name: str, user_name: str, final_report: str):
3132
+ try:
3133
+ report_generator = ReportGenerator(render_with_selenium=False)
3134
+ metadata = {
3135
+ "Query name": query_name,
3136
+ "User name": user_name,
3137
+ "Date": datetime.now().strftime("%Y-%m-%d"),
3138
+ "Time": datetime.now().strftime("%H:%M:%S"),
3139
+ }
3140
+
3141
+ pdf_bytes = report_generator.generate_report_pdf(solution_content=final_report, metadata=metadata)
3142
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
3143
+ tmp_file.write(pdf_bytes)
3144
+ tmp_path = tmp_file.name
3145
+ logging.info(f"handle_generate_report: PDF report generated at {tmp_path}")
3146
+ return "Report generated successfully.", gr.update(value=tmp_path, visible=True)
3147
+ except Exception as e:
3148
+ logging.error(f"handle_generate_report error: {e}", exc_info=True)
3149
+ return f"Error generating report: {str(e)}", None
3150
+
3151
  def extract_summary_from_crumbs(crumbs_list: list) -> str:
3152
  """
3153
  Given a list of crumb records (each with 'url', 'summary', and 'full_content'),