Guiyom committed on
Commit
01a84ee
·
verified ·
1 Parent(s): 01d7266

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -156
app.py CHANGED
@@ -21,6 +21,66 @@ TOTAL_SUMMARIZED_WORDS = 0
21
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
22
  # ============================================================================= Helper functions
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def process_pdf(url: str) -> str:
25
  try:
26
  headers = {"User-Agent": get_random_header()}
@@ -2901,6 +2961,7 @@ class ReportGenerator:
2901
  solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
2902
  solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
2903
  html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
 
2904
  html_content = html_content.replace("<h2>Table of Contents</h2>", "<div class='page-break'></div><h2>Table of Contents</h2>")
2905
  html_content = html_content.replace("<h2>Introduction</h2>", "<div class='page-break'></div><h2>Introduction</h2>")
2906
  html_content = html_content.replace("<h2>Conclusion</h2>", "<div class='page-break'></div><h2>Conclusion</h2>")
@@ -2911,80 +2972,29 @@ class ReportGenerator:
2911
  return html_content
2912
 
2913
  def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
2914
- # Generate the full HTML report (including text, focus placeholders, and mermaid visuals as iframes)
2915
  html_report = self.generate_report_html(solution_content)
2916
 
2917
- # Add header
2918
  date_str = datetime.now().strftime("%Y-%m-%d")
2919
  header = ""
2920
  if metadata:
2921
- header = f"<p>Search Query: {metadata.get('Query name', 'N/A')}<br>Author: {metadata.get('User name', 'N/A')} | Date: {metadata.get('Date', date_str)}</p>"
 
2922
  soup = BeautifulSoup(html_report, "html.parser")
2923
- body_tag = soup.body
2924
- if body_tag:
2925
- body_tag.insert(0, BeautifulSoup(header, "html.parser"))
2926
- updated_html = str(soup)
2927
-
2928
- # Parse the HTML
2929
- logging.info(f"ReportGenerator: soup report generated:\n{soup}")
2930
-
2931
- # Find all mermaid visual iframes (assumed to have class "visual-frame")
2932
- visual_iframes = soup.find_all("iframe", class_="visual-frame")
2933
-
2934
- if visual_iframes:
2935
- # Set up Selenium with a window size and high DPI for better image resolution
2936
- import base64, tempfile, time
2937
- import chromedriver_autoinstaller
2938
- chromedriver_autoinstaller.install()
2939
- # (Removed the explicit print statement to keep logs clean)
2940
- from selenium import webdriver
2941
- from selenium.webdriver.chrome.options import Options
2942
- from selenium.webdriver.chrome.service import Service
2943
- options = Options()
2944
- options.add_argument("--headless")
2945
- options.add_argument("--no-sandbox")
2946
- options.add_argument("--disable-dev-shm-usage")
2947
- options.add_argument("--window-size=1200,1200")
2948
- options.add_argument("--force-device-scale-factor=2")
2949
- service = Service(log_path=os.devnull)
2950
- driver = webdriver.Chrome(service=service, options=options)
2951
-
2952
- for iframe in visual_iframes:
2953
- # Assume the iframe has its content in srcdoc (as generated in generate_visual_snippet)
2954
- srcdoc = iframe.get("srcdoc")
2955
- if srcdoc:
2956
- with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_file:
2957
- tmp_file.write(srcdoc.encode("utf-8"))
2958
- tmp_file.flush()
2959
- file_url = "file://" + tmp_file.name
2960
-
2961
- driver.get(file_url)
2962
- time.sleep(3) # Allow time for JavaScript (e.g., mermaid) to render
2963
- screenshot_png = driver.get_screenshot_as_png()
2964
-
2965
- # Optional: Crop the screenshot to remove extra whitespace:
2966
- from PIL import Image
2967
- from io import BytesIO
2968
- img = Image.open(BytesIO(screenshot_png))
2969
- cropped_img = img.crop(img.getbbox())
2970
- buffer = BytesIO()
2971
- cropped_img.save(buffer, format='PNG')
2972
- cropped_png = buffer.getvalue()
2973
-
2974
- b64_img = base64.b64encode(cropped_png).decode("utf-8")
2975
- new_tag = soup.new_tag("img")
2976
- # Add page-break style to avoid forcing a new page after the image when generating the PDF.
2977
- new_tag["style"] = "max-width: 500px; display: block; margin: auto; page-break-after: avoid;"
2978
- new_tag["src"] = "data:image/png;base64," + b64_img
2979
- iframe.replace_with(new_tag)
2980
- driver.quit()
2981
 
2982
- # Instead of converting the entire soup (which may include nested <html> tags), extract only the content within <body>
2983
  body_tag = soup.find("body")
2984
- body_content = body_tag.decode_contents() if body_tag else ""
2985
-
2986
- # Reassemble a clean, single HTML document with our desired CSS (preserving line breaks)
2987
- final_html = f"""
2988
  <html>
2989
  <head>
2990
  <meta charset="utf-8" />
@@ -3011,101 +3021,16 @@ class ReportGenerator:
3011
  </body>
3012
  </html>
3013
  """
3014
- # Preprocessing for log display
3015
- def remove_img_tags(input_string):
3016
- # Regex pattern to match <img> tags with any src attribute
3017
- pattern = r'<img src=.*?>'
3018
- # Replace all occurrences with an empty string
3019
- cleaned_string = re.sub(pattern, '', input_string, flags=re.MULTILINE)
3020
- return cleaned_string
3021
- cleaned_string = remove_img_tags(final_html)
3022
- logging.info(f"ReportGenerator: Final HTML for PDF conversion:\n{cleaned_string}")
3023
-
3024
- # Crafting compliance
3025
- final_html = final_html.replace("<h1","<br><br><br><h1").replace("</h1>","</h1><br>")
3026
- final_html = final_html.replace("<h2","<br><br><b><h2").replace("</h2>","</b></h2><br>")
3027
- final_html = final_html.replace("<h3","<br><br><h3").replace("</h3>","</b></h3><br>")
3028
- final_html = final_html.replace("<h4","<br><h4")
3029
- final_html = final_html.replace("<div","<br><div")
3030
- final_html = final_html.replace("<table>","<br><table>")
3031
-
3032
- # Generate the final PDF from final_html using xhtml2pdf (A4 layout)
3033
  pdf_buffer = io.BytesIO()
3034
  pisa_status = pisa.CreatePDF(final_html, dest=pdf_buffer,
3035
- link_callback=lambda uri, rel: uri,
3036
- default_css="""
3037
- @page {
3038
- size: A4;
3039
- margin: 0.5in;
3040
- }
3041
- body {
3042
- font-family: Helvetica, sans-serif;
3043
- background: white;
3044
- margin: 40px;
3045
- padding: 0;
3046
- }
3047
- h1 {
3048
- font-size: 20pt;
3049
- margin-bottom: 12px;
3050
- text-align: left;
3051
- font-weight: bold;
3052
- }
3053
- h2 {
3054
- font-size: 16pt;
3055
- margin-bottom: 10px;
3056
- text-align: left;
3057
- font-weight: bold;
3058
- }
3059
- h3 {
3060
- font-size: 14pt;
3061
- margin-bottom: 8px;
3062
- text-align: left;
3063
- font-weight: bold;
3064
- }
3065
- h4 {
3066
- font-size: 12pt;
3067
- text-align: left;
3068
- font-weight: bold;
3069
- }
3070
- table {
3071
- border: 1px solid black;
3072
- }
3073
- p {
3074
- font-size: 11pt;
3075
- line-height: 1.5;
3076
- margin-bottom: 10px;
3077
- }
3078
- pre, div {
3079
- }
3080
- ol, ul {
3081
- font-size: 11pt;
3082
- margin-left: 20px;
3083
- line-height: 1.5;
3084
- }
3085
- hr {
3086
- border: 1px solid #ccc;
3087
- margin: 20px 0;
3088
- }
3089
- table {
3090
- border-collapse: collapse;
3091
- width: 100%;
3092
- margin-bottom: 10px;
3093
- }
3094
- th, td {
3095
- border: 1px solid #ccc;
3096
- padding: 8px;
3097
- text-align: left;
3098
- }
3099
- th {
3100
- background-color: #f2f2f2;
3101
- }
3102
- .page-break {
3103
- page-break-before: always;
3104
- }
3105
- """)
3106
  if pisa_status.err:
3107
  logging.error("Error generating PDF with xhtml2pdf.")
3108
  return None
 
3109
  return pdf_buffer.getvalue()
3110
 
3111
  def handle_generate_report(query_name: str, user_name: str, final_report: str):
 
21
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
22
  # ============================================================================= Helper functions
23
 
24
+ def capture_visual_screenshot(srcdoc: str) -> str:
25
+ """
26
+ Opens a temporary HTML file from the provided srcdoc string,
27
+ loads it in a headless Chrome browser using Selenium,
28
+ waits for the content to render,
29
+ takes a screenshot, crops it, and returns a base64-encoded PNG image.
30
+ """
31
+ options = Options()
32
+ options.add_argument("--headless")
33
+ options.add_argument("--no-sandbox")
34
+ options.add_argument("--disable-dev-shm-usage")
35
+ driver = None
36
+ try:
37
+ driver = webdriver.Chrome(options=options)
38
+ driver.set_window_size(1080, 720) # Adjust per your expected visual dimensions
39
+
40
+ # Write the srcdoc to a temporary HTML file
41
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_file:
42
+ tmp_file.write(srcdoc.encode("utf-8"))
43
+ tmp_file.flush()
44
+ file_url = "file://" + tmp_file.name
45
+
46
+ driver.get(file_url)
47
+ time.sleep(3) # Allow time for JavaScript (e.g., mermaid) to render
48
+
49
+ screenshot_png = driver.get_screenshot_as_png()
50
+ image = Image.open(BytesIO(screenshot_png))
51
+ # Crop the image to remove extra whitespace
52
+ cropped_image = image.crop(image.getbbox())
53
+ buffered = BytesIO()
54
+ cropped_image.save(buffered, format='PNG')
55
+ base64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
56
+ return base64_img
57
+ except WebDriverException as e:
58
+ logging.error("Error in capture_visual_screenshot: %s", e)
59
+ return ""
60
+ finally:
61
+ if driver:
62
+ driver.quit()
63
+
64
+ def replace_visual_iframes(soup: BeautifulSoup) -> BeautifulSoup:
65
+ """
66
+ Finds all <iframe class="visual-frame"> tags in the provided BeautifulSoup object,
67
+ uses capture_visual_screenshot() to get a base64 image,
68
+ and replaces each iframe with an <img> tag embedding the screenshot.
69
+ """
70
+ iframes = soup.find_all("iframe", class_="visual-frame")
71
+ for iframe in iframes:
72
+ srcdoc = iframe.get("srcdoc")
73
+ if srcdoc:
74
+ base64_img = capture_visual_screenshot(srcdoc)
75
+ if base64_img:
76
+ new_img = soup.new_tag("img")
77
+ new_img["src"] = "data:image/png;base64," + base64_img
78
+ new_img["style"] = "max-width:100%; display:block; margin:auto; page-break-after:avoid;"
79
+ iframe.replace_with(new_img)
80
+ else:
81
+ logging.error("Failed to capture screenshot for an iframe.")
82
+ return soup
83
+
84
  def process_pdf(url: str) -> str:
85
  try:
86
  headers = {"User-Agent": get_random_header()}
 
2961
  solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
2962
  solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
2963
  html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
2964
+ # Insert page break divs before key sections
2965
  html_content = html_content.replace("<h2>Table of Contents</h2>", "<div class='page-break'></div><h2>Table of Contents</h2>")
2966
  html_content = html_content.replace("<h2>Introduction</h2>", "<div class='page-break'></div><h2>Introduction</h2>")
2967
  html_content = html_content.replace("<h2>Conclusion</h2>", "<div class='page-break'></div><h2>Conclusion</h2>")
 
2972
  return html_content
2973
 
2974
  def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
2975
+ # Generate the full HTML report (including text, placeholders, and mermaid visuals as iframes)
2976
  html_report = self.generate_report_html(solution_content)
2977
 
2978
+ # Add header if provided in metadata
2979
  date_str = datetime.now().strftime("%Y-%m-%d")
2980
  header = ""
2981
  if metadata:
2982
+ header = (f"<p>Search Query: {metadata.get('Query name', 'N/A')}<br>"
2983
+ f"Author: {metadata.get('User name', 'N/A')} | Date: {metadata.get('Date', date_str)}</p>")
2984
  soup = BeautifulSoup(html_report, "html.parser")
2985
+ if soup.body:
2986
+ soup.body.insert(0, BeautifulSoup(header, "html.parser"))
2987
+ logging.info("ReportGenerator: Soup report generated:\n%s", soup)
2988
+
2989
+ # Replace all iframes (class 'visual-frame') with images
2990
+ soup = replace_visual_iframes(soup)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2991
 
2992
+ # Extract only the body content if <body> exists, otherwise use full HTML
2993
  body_tag = soup.find("body")
2994
+ body_content = body_tag.decode_contents() if body_tag else str(soup)
2995
+
2996
+ # Reassemble a clean HTML document with inline CSS styles for PDF conversion.
2997
+ final_html = f"""<!DOCTYPE html>
2998
  <html>
2999
  <head>
3000
  <meta charset="utf-8" />
 
3021
  </body>
3022
  </html>
3023
  """
3024
+ logging.info("ReportGenerator: Final HTML for PDF conversion generated.")
3025
+
3026
+ # Generate the PDF using xhtml2pdf (pisa)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3027
  pdf_buffer = io.BytesIO()
3028
  pisa_status = pisa.CreatePDF(final_html, dest=pdf_buffer,
3029
+ link_callback=lambda uri, rel: uri)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3030
  if pisa_status.err:
3031
  logging.error("Error generating PDF with xhtml2pdf.")
3032
  return None
3033
+ logging.info("ReportGenerator: PDF generated successfully.")
3034
  return pdf_buffer.getvalue()
3035
 
3036
  def handle_generate_report(query_name: str, user_name: str, final_report: str):