Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -21,6 +21,66 @@ TOTAL_SUMMARIZED_WORDS = 0
|
|
| 21 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
| 22 |
# ============================================================================= Helper functions
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
def process_pdf(url: str) -> str:
|
| 25 |
try:
|
| 26 |
headers = {"User-Agent": get_random_header()}
|
|
@@ -2901,6 +2961,7 @@ class ReportGenerator:
|
|
| 2901 |
solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
|
| 2902 |
solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
|
| 2903 |
html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
|
|
|
|
| 2904 |
html_content = html_content.replace("<h2>Table of Contents</h2>", "<div class='page-break'></div><h2>Table of Contents</h2>")
|
| 2905 |
html_content = html_content.replace("<h2>Introduction</h2>", "<div class='page-break'></div><h2>Introduction</h2>")
|
| 2906 |
html_content = html_content.replace("<h2>Conclusion</h2>", "<div class='page-break'></div><h2>Conclusion</h2>")
|
|
@@ -2911,80 +2972,29 @@ class ReportGenerator:
|
|
| 2911 |
return html_content
|
| 2912 |
|
| 2913 |
def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
|
| 2914 |
-
# Generate the full HTML report (including text,
|
| 2915 |
html_report = self.generate_report_html(solution_content)
|
| 2916 |
|
| 2917 |
-
# Add header
|
| 2918 |
date_str = datetime.now().strftime("%Y-%m-%d")
|
| 2919 |
header = ""
|
| 2920 |
if metadata:
|
| 2921 |
-
header = f"<p>Search Query: {metadata.get('Query name', 'N/A')}<br>
|
|
|
|
| 2922 |
soup = BeautifulSoup(html_report, "html.parser")
|
| 2923 |
-
|
| 2924 |
-
|
| 2925 |
-
|
| 2926 |
-
|
| 2927 |
-
|
| 2928 |
-
|
| 2929 |
-
logging.info(f"ReportGenerator: soup report generated:\n{soup}")
|
| 2930 |
-
|
| 2931 |
-
# Find all mermaid visual iframes (assumed to have class "visual-frame")
|
| 2932 |
-
visual_iframes = soup.find_all("iframe", class_="visual-frame")
|
| 2933 |
-
|
| 2934 |
-
if visual_iframes:
|
| 2935 |
-
# Set up Selenium with a window size and high DPI for better image resolution
|
| 2936 |
-
import base64, tempfile, time
|
| 2937 |
-
import chromedriver_autoinstaller
|
| 2938 |
-
chromedriver_autoinstaller.install()
|
| 2939 |
-
# (Removed the explicit print statement to keep logs clean)
|
| 2940 |
-
from selenium import webdriver
|
| 2941 |
-
from selenium.webdriver.chrome.options import Options
|
| 2942 |
-
from selenium.webdriver.chrome.service import Service
|
| 2943 |
-
options = Options()
|
| 2944 |
-
options.add_argument("--headless")
|
| 2945 |
-
options.add_argument("--no-sandbox")
|
| 2946 |
-
options.add_argument("--disable-dev-shm-usage")
|
| 2947 |
-
options.add_argument("--window-size=1200,1200")
|
| 2948 |
-
options.add_argument("--force-device-scale-factor=2")
|
| 2949 |
-
service = Service(log_path=os.devnull)
|
| 2950 |
-
driver = webdriver.Chrome(service=service, options=options)
|
| 2951 |
-
|
| 2952 |
-
for iframe in visual_iframes:
|
| 2953 |
-
# Assume the iframe has its content in srcdoc (as generated in generate_visual_snippet)
|
| 2954 |
-
srcdoc = iframe.get("srcdoc")
|
| 2955 |
-
if srcdoc:
|
| 2956 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_file:
|
| 2957 |
-
tmp_file.write(srcdoc.encode("utf-8"))
|
| 2958 |
-
tmp_file.flush()
|
| 2959 |
-
file_url = "file://" + tmp_file.name
|
| 2960 |
-
|
| 2961 |
-
driver.get(file_url)
|
| 2962 |
-
time.sleep(3) # Allow time for JavaScript (e.g., mermaid) to render
|
| 2963 |
-
screenshot_png = driver.get_screenshot_as_png()
|
| 2964 |
-
|
| 2965 |
-
# Optional: Crop the screenshot to remove extra whitespace:
|
| 2966 |
-
from PIL import Image
|
| 2967 |
-
from io import BytesIO
|
| 2968 |
-
img = Image.open(BytesIO(screenshot_png))
|
| 2969 |
-
cropped_img = img.crop(img.getbbox())
|
| 2970 |
-
buffer = BytesIO()
|
| 2971 |
-
cropped_img.save(buffer, format='PNG')
|
| 2972 |
-
cropped_png = buffer.getvalue()
|
| 2973 |
-
|
| 2974 |
-
b64_img = base64.b64encode(cropped_png).decode("utf-8")
|
| 2975 |
-
new_tag = soup.new_tag("img")
|
| 2976 |
-
# Add page-break style to avoid forcing a new page after the image when generating the PDF.
|
| 2977 |
-
new_tag["style"] = "max-width: 500px; display: block; margin: auto; page-break-after: avoid;"
|
| 2978 |
-
new_tag["src"] = "data:image/png;base64," + b64_img
|
| 2979 |
-
iframe.replace_with(new_tag)
|
| 2980 |
-
driver.quit()
|
| 2981 |
|
| 2982 |
-
#
|
| 2983 |
body_tag = soup.find("body")
|
| 2984 |
-
body_content = body_tag.decode_contents() if body_tag else
|
| 2985 |
-
|
| 2986 |
-
# Reassemble a clean
|
| 2987 |
-
final_html = f"""
|
| 2988 |
<html>
|
| 2989 |
<head>
|
| 2990 |
<meta charset="utf-8" />
|
|
@@ -3011,101 +3021,16 @@ class ReportGenerator:
|
|
| 3011 |
</body>
|
| 3012 |
</html>
|
| 3013 |
"""
|
| 3014 |
-
|
| 3015 |
-
|
| 3016 |
-
|
| 3017 |
-
pattern = r'<img src=.*?>'
|
| 3018 |
-
# Replace all occurrences with an empty string
|
| 3019 |
-
cleaned_string = re.sub(pattern, '', input_string, flags=re.MULTILINE)
|
| 3020 |
-
return cleaned_string
|
| 3021 |
-
cleaned_string = remove_img_tags(final_html)
|
| 3022 |
-
logging.info(f"ReportGenerator: Final HTML for PDF conversion:\n{cleaned_string}")
|
| 3023 |
-
|
| 3024 |
-
# Crafting compliance
|
| 3025 |
-
final_html = final_html.replace("<h1","<br><br><br><h1").replace("</h1>","</h1><br>")
|
| 3026 |
-
final_html = final_html.replace("<h2","<br><br><b><h2").replace("</h2>","</b></h2><br>")
|
| 3027 |
-
final_html = final_html.replace("<h3","<br><br><h3").replace("</h3>","</b></h3><br>")
|
| 3028 |
-
final_html = final_html.replace("<h4","<br><h4")
|
| 3029 |
-
final_html = final_html.replace("<div","<br><div")
|
| 3030 |
-
final_html = final_html.replace("<table>","<br><table>")
|
| 3031 |
-
|
| 3032 |
-
# Generate the final PDF from final_html using xhtml2pdf (A4 layout)
|
| 3033 |
pdf_buffer = io.BytesIO()
|
| 3034 |
pisa_status = pisa.CreatePDF(final_html, dest=pdf_buffer,
|
| 3035 |
-
link_callback=lambda uri, rel: uri
|
| 3036 |
-
default_css="""
|
| 3037 |
-
@page {
|
| 3038 |
-
size: A4;
|
| 3039 |
-
margin: 0.5in;
|
| 3040 |
-
}
|
| 3041 |
-
body {
|
| 3042 |
-
font-family: Helvetica, sans-serif;
|
| 3043 |
-
background: white;
|
| 3044 |
-
margin: 40px;
|
| 3045 |
-
padding: 0;
|
| 3046 |
-
}
|
| 3047 |
-
h1 {
|
| 3048 |
-
font-size: 20pt;
|
| 3049 |
-
margin-bottom: 12px;
|
| 3050 |
-
text-align: left;
|
| 3051 |
-
font-weight: bold;
|
| 3052 |
-
}
|
| 3053 |
-
h2 {
|
| 3054 |
-
font-size: 16pt;
|
| 3055 |
-
margin-bottom: 10px;
|
| 3056 |
-
text-align: left;
|
| 3057 |
-
font-weight: bold;
|
| 3058 |
-
}
|
| 3059 |
-
h3 {
|
| 3060 |
-
font-size: 14pt;
|
| 3061 |
-
margin-bottom: 8px;
|
| 3062 |
-
text-align: left;
|
| 3063 |
-
font-weight: bold;
|
| 3064 |
-
}
|
| 3065 |
-
h4 {
|
| 3066 |
-
font-size: 12pt;
|
| 3067 |
-
text-align: left;
|
| 3068 |
-
font-weight: bold;
|
| 3069 |
-
}
|
| 3070 |
-
table {
|
| 3071 |
-
border: 1px solid black;
|
| 3072 |
-
}
|
| 3073 |
-
p {
|
| 3074 |
-
font-size: 11pt;
|
| 3075 |
-
line-height: 1.5;
|
| 3076 |
-
margin-bottom: 10px;
|
| 3077 |
-
}
|
| 3078 |
-
pre, div {
|
| 3079 |
-
}
|
| 3080 |
-
ol, ul {
|
| 3081 |
-
font-size: 11pt;
|
| 3082 |
-
margin-left: 20px;
|
| 3083 |
-
line-height: 1.5;
|
| 3084 |
-
}
|
| 3085 |
-
hr {
|
| 3086 |
-
border: 1px solid #ccc;
|
| 3087 |
-
margin: 20px 0;
|
| 3088 |
-
}
|
| 3089 |
-
table {
|
| 3090 |
-
border-collapse: collapse;
|
| 3091 |
-
width: 100%;
|
| 3092 |
-
margin-bottom: 10px;
|
| 3093 |
-
}
|
| 3094 |
-
th, td {
|
| 3095 |
-
border: 1px solid #ccc;
|
| 3096 |
-
padding: 8px;
|
| 3097 |
-
text-align: left;
|
| 3098 |
-
}
|
| 3099 |
-
th {
|
| 3100 |
-
background-color: #f2f2f2;
|
| 3101 |
-
}
|
| 3102 |
-
.page-break {
|
| 3103 |
-
page-break-before: always;
|
| 3104 |
-
}
|
| 3105 |
-
""")
|
| 3106 |
if pisa_status.err:
|
| 3107 |
logging.error("Error generating PDF with xhtml2pdf.")
|
| 3108 |
return None
|
|
|
|
| 3109 |
return pdf_buffer.getvalue()
|
| 3110 |
|
| 3111 |
def handle_generate_report(query_name: str, user_name: str, final_report: str):
|
|
|
|
| 21 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
| 22 |
# ============================================================================= Helper functions
|
| 23 |
|
| 24 |
+
def capture_visual_screenshot(srcdoc: str) -> str:
    """
    Render an HTML snippet in headless Chrome and return a screenshot of it.

    The snippet is written to a temporary ``.html`` file, loaded in a headless
    Chrome session driven by Selenium, given a fixed delay to render, and
    captured as a PNG. The uniform border around the rendered content is then
    trimmed and the result is returned base64-encoded.

    Args:
        srcdoc: HTML markup to render (e.g. the ``srcdoc`` of a visual iframe).

    Returns:
        The base64-encoded PNG as a ``str``, or ``""`` if Selenium raised a
        ``WebDriverException`` (the error is logged).
    """
    # Function-scope imports keep this helper self-contained: the module-level
    # import block is not visible from here, and the code this helper replaced
    # imported these same names locally.
    import base64
    import os
    import tempfile
    import time
    from io import BytesIO
    from pathlib import Path

    from PIL import Image, ImageChops
    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.chrome.options import Options

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = None
    tmp_path = None
    try:
        driver = webdriver.Chrome(options=options)
        driver.set_window_size(1080, 720)  # Adjust per your expected visual dimensions

        # Write the srcdoc to a temporary HTML file. delete=False lets the
        # browser open the file after it is closed; it is removed in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_file:
            tmp_path = tmp_file.name
            tmp_file.write(srcdoc.encode("utf-8"))

        # as_uri() yields a correctly escaped file:// URL for the absolute path.
        driver.get(Path(tmp_path).as_uri())
        time.sleep(3)  # Allow time for JavaScript (e.g., mermaid) to render

        screenshot_png = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(screenshot_png)).convert("RGB")

        # Image.getbbox() bounds the *non-zero* pixels, so calling it directly
        # on a screenshot with a white page background crops nothing. Diff
        # against a canvas filled with the corner pixel's colour so that the
        # uniform background becomes zero and only the content is bounded.
        background = Image.new("RGB", image.size, image.getpixel((0, 0)))
        content_box = ImageChops.difference(image, background).getbbox()
        # A completely uniform page has no bounding box; keep it uncropped.
        cropped_image = image.crop(content_box) if content_box else image

        buffered = BytesIO()
        cropped_image.save(buffered, format='PNG')
        return base64.b64encode(buffered.getvalue()).decode("utf-8")
    except WebDriverException as e:
        logging.error("Error in capture_visual_screenshot: %s", e)
        return ""
    finally:
        if driver:
            driver.quit()
        if tmp_path:
            # Best-effort cleanup so repeated calls do not accumulate files.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
|
| 63 |
+
|
| 64 |
+
def replace_visual_iframes(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Swap every ``<iframe class="visual-frame">`` in *soup* for a static image.

    For each matching iframe that carries a ``srcdoc`` attribute, the markup is
    passed to capture_visual_screenshot(); when that yields a non-empty base64
    string, the iframe is replaced in place by an ``<img>`` whose ``src`` is a
    PNG data URI. Iframes without ``srcdoc`` are left untouched. If the capture
    returns an empty string, an error is logged and the iframe is kept as is.

    Returns the same (mutated) soup object that was passed in.
    """
    for frame in soup.find_all("iframe", class_="visual-frame"):
        markup = frame.get("srcdoc")
        if not markup:
            # Nothing to render for this iframe.
            continue

        encoded_png = capture_visual_screenshot(markup)
        if not encoded_png:
            logging.error("Failed to capture screenshot for an iframe.")
            continue

        image_tag = soup.new_tag("img")
        image_tag["src"] = "data:image/png;base64," + encoded_png
        image_tag["style"] = "max-width:100%; display:block; margin:auto; page-break-after:avoid;"
        frame.replace_with(image_tag)

    return soup
|
| 83 |
+
|
| 84 |
def process_pdf(url: str) -> str:
|
| 85 |
try:
|
| 86 |
headers = {"User-Agent": get_random_header()}
|
|
|
|
| 2961 |
solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
|
| 2962 |
solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
|
| 2963 |
html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
|
| 2964 |
+
# Insert page break divs before key sections
|
| 2965 |
html_content = html_content.replace("<h2>Table of Contents</h2>", "<div class='page-break'></div><h2>Table of Contents</h2>")
|
| 2966 |
html_content = html_content.replace("<h2>Introduction</h2>", "<div class='page-break'></div><h2>Introduction</h2>")
|
| 2967 |
html_content = html_content.replace("<h2>Conclusion</h2>", "<div class='page-break'></div><h2>Conclusion</h2>")
|
|
|
|
| 2972 |
return html_content
|
| 2973 |
|
| 2974 |
def generate_report_pdf(self, solution_content: str, metadata: dict = None) -> bytes:
|
| 2975 |
+
# Generate the full HTML report (including text, placeholders, and mermaid visuals as iframes)
|
| 2976 |
html_report = self.generate_report_html(solution_content)
|
| 2977 |
|
| 2978 |
+
# Add header if provided in metadata
|
| 2979 |
date_str = datetime.now().strftime("%Y-%m-%d")
|
| 2980 |
header = ""
|
| 2981 |
if metadata:
|
| 2982 |
+
header = (f"<p>Search Query: {metadata.get('Query name', 'N/A')}<br>"
|
| 2983 |
+
f"Author: {metadata.get('User name', 'N/A')} | Date: {metadata.get('Date', date_str)}</p>")
|
| 2984 |
soup = BeautifulSoup(html_report, "html.parser")
|
| 2985 |
+
if soup.body:
|
| 2986 |
+
soup.body.insert(0, BeautifulSoup(header, "html.parser"))
|
| 2987 |
+
logging.info("ReportGenerator: Soup report generated:\n%s", soup)
|
| 2988 |
+
|
| 2989 |
+
# Replace all iframes (class 'visual-frame') with images
|
| 2990 |
+
soup = replace_visual_iframes(soup)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2991 |
|
| 2992 |
+
# Extract only the body content if <body> exists, otherwise use full HTML
|
| 2993 |
body_tag = soup.find("body")
|
| 2994 |
+
body_content = body_tag.decode_contents() if body_tag else str(soup)
|
| 2995 |
+
|
| 2996 |
+
# Reassemble a clean HTML document with inline CSS styles for PDF conversion.
|
| 2997 |
+
final_html = f"""<!DOCTYPE html>
|
| 2998 |
<html>
|
| 2999 |
<head>
|
| 3000 |
<meta charset="utf-8" />
|
|
|
|
| 3021 |
</body>
|
| 3022 |
</html>
|
| 3023 |
"""
|
| 3024 |
+
logging.info("ReportGenerator: Final HTML for PDF conversion generated.")
|
| 3025 |
+
|
| 3026 |
+
# Generate the PDF using xhtml2pdf (pisa)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3027 |
pdf_buffer = io.BytesIO()
|
| 3028 |
pisa_status = pisa.CreatePDF(final_html, dest=pdf_buffer,
|
| 3029 |
+
link_callback=lambda uri, rel: uri)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3030 |
if pisa_status.err:
|
| 3031 |
logging.error("Error generating PDF with xhtml2pdf.")
|
| 3032 |
return None
|
| 3033 |
+
logging.info("ReportGenerator: PDF generated successfully.")
|
| 3034 |
return pdf_buffer.getvalue()
|
| 3035 |
|
| 3036 |
def handle_generate_report(query_name: str, user_name: str, final_report: str):
|