Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,299 +2,309 @@ import dash
|
|
| 2 |
from dash import dcc, html, Input, Output, State
|
| 3 |
import dash_bootstrap_components as dbc
|
| 4 |
from dash.exceptions import PreventUpdate
|
| 5 |
-
import
|
| 6 |
-
from github import Github
|
| 7 |
-
import gitlab
|
| 8 |
import requests
|
| 9 |
-
import
|
| 10 |
-
import
|
| 11 |
-
import
|
|
|
|
| 12 |
import logging
|
| 13 |
-
import
|
| 14 |
-
|
| 15 |
-
from
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# Initialize Dash app
|
| 22 |
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
|
| 23 |
-
server = app.server
|
| 24 |
-
|
| 25 |
-
#
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
1. Any configuration options or settings related to this UI component
|
| 114 |
-
2. Security considerations or access control related to this feature
|
| 115 |
-
3. How to monitor or troubleshoot issues with this component
|
| 116 |
-
4. Best practices for managing and maintaining this part of the system
|
| 117 |
-
|
| 118 |
-
Important formatting instructions:
|
| 119 |
-
- The output should be in plain text no markdown for example for example do not use * or ** or # or ##. Instead use numbers like 1., 2. for bullets
|
| 120 |
-
- Use clear section titles
|
| 121 |
-
- Use clear section titles that has the name of the file in parenthesis
|
| 122 |
-
- Follow this numbering heirarchy (1.0, 1.1, 1.2), (2.0, 2.1, 2.2), (3.0, 3.1, 3.2)
|
| 123 |
-
- Explain the purpose and implications of each component
|
| 124 |
-
"""
|
| 125 |
-
|
| 126 |
-
response = model.generate_content(prompt)
|
| 127 |
-
logger.info(f"Generated {guide_type} section for {file_path}")
|
| 128 |
-
return response.text
|
| 129 |
-
|
| 130 |
-
def generate_guide(git_provider, repo_url, guide_type, exclude_folders):
|
| 131 |
try:
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
guide_sections = []
|
| 136 |
-
for file_path, content in file_contents:
|
| 137 |
-
section = generate_guide_section(file_path, content, guide_type)
|
| 138 |
-
guide_sections.append(section)
|
| 139 |
-
logger.info(f"Added section for {file_path}")
|
| 140 |
-
|
| 141 |
-
full_guide = f"# {guide_type}\n\n" + "\n\n".join(guide_sections)
|
| 142 |
-
|
| 143 |
-
logger.info("Creating DOCX file")
|
| 144 |
-
doc = docx.Document()
|
| 145 |
-
doc.add_heading(guide_type, 0)
|
| 146 |
-
|
| 147 |
-
for line in full_guide.split('\n'):
|
| 148 |
-
line = line.strip()
|
| 149 |
-
if line.startswith('# '):
|
| 150 |
-
doc.add_heading(line[2:], level=1)
|
| 151 |
-
elif line.startswith('## '):
|
| 152 |
-
doc.add_heading(line[3:], level=2)
|
| 153 |
-
elif line.startswith('Step'):
|
| 154 |
-
doc.add_paragraph(line, style='List Number')
|
| 155 |
-
else:
|
| 156 |
-
doc.add_paragraph(line)
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix='.md', mode='w', encoding='utf-8') as temp_md:
|
| 165 |
-
temp_md.write(full_guide)
|
| 166 |
-
md_path = temp_md.name
|
| 167 |
-
logger.info(f"Markdown file saved: {md_path}")
|
| 168 |
-
|
| 169 |
-
logger.info("Guide generation completed successfully")
|
| 170 |
-
return full_guide, docx_path, md_path
|
| 171 |
-
|
| 172 |
except Exception as e:
|
| 173 |
-
logger.error(f"
|
| 174 |
-
return f"An error occurred: {str(e)}"
|
| 175 |
|
| 176 |
# App layout
|
| 177 |
app.layout = dbc.Container([
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
)
|
|
|
|
|
|
|
| 191 |
]),
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
),
|
| 195 |
-
|
| 196 |
-
dbc.Row([
|
| 197 |
-
dbc.Col([
|
| 198 |
-
html.H1("Automated Guide Generator", className="text-center my-4"),
|
| 199 |
-
html.P("Generate a user guide or administration guide based on the UI-related code in a Git repository using Gemini AI. Select a Git provider, enter repository details, choose the guide type, and let AI create a comprehensive guide.", className="text-center mb-4"),
|
| 200 |
-
|
| 201 |
-
dbc.Card([
|
| 202 |
-
dbc.CardBody([
|
| 203 |
-
dbc.Form([
|
| 204 |
-
dbc.Select(
|
| 205 |
-
id="git-provider",
|
| 206 |
-
options=[
|
| 207 |
-
{"label": "GitHub", "value": "GitHub"},
|
| 208 |
-
{"label": "GitLab", "value": "GitLab"},
|
| 209 |
-
{"label": "Gitea", "value": "Gitea"}
|
| 210 |
-
],
|
| 211 |
-
placeholder="Select Git Provider",
|
| 212 |
-
),
|
| 213 |
-
dbc.Input(id="repo-url", type="text", placeholder="Repository URL (owner/repo)"),
|
| 214 |
-
dbc.RadioItems(
|
| 215 |
-
id="guide-type",
|
| 216 |
-
options=[
|
| 217 |
-
{"label": "User Guide", "value": "User Guide"},
|
| 218 |
-
{"label": "Administration Guide", "value": "Administration Guide"}
|
| 219 |
-
],
|
| 220 |
-
inline=True,
|
| 221 |
-
),
|
| 222 |
-
dbc.Input(id="exclude-folders", type="text", placeholder="Exclude Folders (comma-separated)"),
|
| 223 |
-
dbc.Button("Generate Guide", id="generate-button", color="primary", className="mt-3"),
|
| 224 |
-
])
|
| 225 |
-
])
|
| 226 |
-
], className="mb-4"),
|
| 227 |
-
|
| 228 |
-
dbc.Spinner(
|
| 229 |
-
dbc.Card([
|
| 230 |
-
dbc.CardBody([
|
| 231 |
-
html.H4("Generated Guide", className="card-title"),
|
| 232 |
-
html.Div([
|
| 233 |
-
dbc.Button("Download DOCX", id="download-docx", color="secondary", className="me-2"),
|
| 234 |
-
dbc.Button("Download Markdown", id="download-md", color="secondary"),
|
| 235 |
-
], className="mt-3"),
|
| 236 |
-
dcc.Download(id="download-docx-file"),
|
| 237 |
-
dcc.Download(id="download-md-file"),
|
| 238 |
-
])
|
| 239 |
-
], className="mt-4"),
|
| 240 |
-
color="primary",
|
| 241 |
-
),
|
| 242 |
-
], width=6),
|
| 243 |
-
dbc.Col([
|
| 244 |
-
dbc.Card([
|
| 245 |
-
dbc.CardBody([
|
| 246 |
-
html.H4("Preview", className="card-title"),
|
| 247 |
-
html.Div(id="generated-guide", style={"whiteSpace": "pre-wrap", "height": "400px", "overflowY": "auto"}),
|
| 248 |
-
])
|
| 249 |
-
], className="mt-4"),
|
| 250 |
-
], width=6),
|
| 251 |
-
])
|
| 252 |
], fluid=True)
|
| 253 |
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
Output("download-md", "n_clicks")],
|
| 258 |
-
[Input("generate-button", "n_clicks")],
|
| 259 |
-
[State("git-provider", "value"),
|
| 260 |
-
State("repo-url", "value"),
|
| 261 |
-
State("guide-type", "value"),
|
| 262 |
-
State("exclude-folders", "value")]
|
| 263 |
-
)
|
| 264 |
-
def update_output(n_clicks, git_provider, repo_url, guide_type, exclude_folders):
|
| 265 |
-
if n_clicks is None:
|
| 266 |
raise PreventUpdate
|
| 267 |
-
|
| 268 |
-
def generate_guide_thread():
|
| 269 |
-
nonlocal guide_text, docx_path, md_path
|
| 270 |
-
guide_text, docx_path, md_path = generate_guide(git_provider, repo_url, guide_type, exclude_folders)
|
| 271 |
-
|
| 272 |
-
guide_text, docx_path, md_path = None, None, None
|
| 273 |
-
thread = threading.Thread(target=generate_guide_thread)
|
| 274 |
-
thread.start()
|
| 275 |
-
thread.join()
|
| 276 |
-
|
| 277 |
-
return guide_text, 0, 0 # Reset n_clicks for download buttons
|
| 278 |
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
|
| 289 |
@app.callback(
|
| 290 |
-
Output("download-
|
| 291 |
-
Input("download-
|
| 292 |
-
|
|
|
|
| 293 |
)
|
| 294 |
-
def
|
| 295 |
-
if
|
| 296 |
raise PreventUpdate
|
| 297 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
|
| 299 |
if __name__ == '__main__':
|
| 300 |
print("Starting the Dash application...")
|
|
|
|
| 2 |
from dash import dcc, html, Input, Output, State
|
| 3 |
import dash_bootstrap_components as dbc
|
| 4 |
from dash.exceptions import PreventUpdate
|
| 5 |
+
import base64
|
|
|
|
|
|
|
| 6 |
import requests
|
| 7 |
+
from bs4 import BeautifulSoup
|
| 8 |
+
from urllib.parse import urljoin, urlparse
|
| 9 |
+
from fpdf import FPDF
|
| 10 |
+
import re
|
| 11 |
import logging
|
| 12 |
+
import asyncio
|
| 13 |
+
import aiohttp
|
| 14 |
+
from aiolimiter import AsyncLimiter
|
| 15 |
+
import sqlite3
|
| 16 |
+
from contextlib import contextmanager
|
| 17 |
+
from threading import local
|
| 18 |
+
import time
|
| 19 |
+
import os
|
| 20 |
+
import ssl
|
| 21 |
+
from io import BytesIO
|
| 22 |
+
import tempfile
|
| 23 |
+
import uuid
|
| 24 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 25 |
+
from PyPDF2 import PdfMerger
|
| 26 |
|
| 27 |
# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
server = app.server  # exposed so WSGI servers (e.g. gunicorn on Spaces) can find it

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Thread-local storage for database connections (sqlite3 connections are not
# safe to share across threads by default)
thread_local = local()

# Rate limiter: 10 requests per second, shared by all crawl coroutines
rate_limiter = AsyncLimiter(10, 1)

# Create an SSL context that ignores certificate verification
# NOTE(review): disabling hostname/cert checks exposes the crawler to
# man-in-the-middle attacks — confirm this is intentional for target sites.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE

# ThreadPoolExecutor for background tasks (crawl + PDF build run off the
# Dash request thread)
executor = ThreadPoolExecutor(max_workers=4)
|
| 48 |
+
|
| 49 |
+
@contextmanager
def get_db_connection():
    """Yield this thread's cached SQLite connection, creating it on first use.

    The connection is deliberately left open on exit so the same thread can
    reuse it across calls.
    """
    conn = getattr(thread_local, "connection", None)
    if conn is None:
        conn = sqlite3.connect('crawl_cache.db')
        thread_local.connection = conn
    try:
        yield conn
    finally:
        # Keep the connection open for reuse by this thread.
        pass
|
| 57 |
+
|
| 58 |
+
def init_db():
    """Create the crawl-cache schema if it does not already exist."""
    with get_db_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS pages
                     (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
        # NOTE: url is already the PRIMARY KEY, so this extra index is
        # presumably redundant in SQLite — harmless, kept for compatibility.
        cursor.execute('''CREATE INDEX IF NOT EXISTS idx_url ON pages(url)''')
        conn.commit()


init_db()
|
| 67 |
+
|
| 68 |
+
def clean_text(text):
    """Strip non-printable characters, then collapse each run of non-ASCII
    characters to a single space. Returns plain ASCII-safe text for FPDF."""
    printable = ''.join(filter(str.isprintable, text))
    return re.sub(r'[^\x00-\x7F]+', ' ', printable)
|
| 72 |
+
|
| 73 |
+
async def get_page_content(session, url):
    """Fetch `url` and extract cleaned text snippets from its main content.

    Returns a list of strings; on HTTP error or exception the list contains
    a single error message instead.
    """
    try:
        async with rate_limiter:
            async with session.get(url, timeout=30) as response:
                if response.status != 200:
                    logger.error(f"Error fetching {url}: HTTP {response.status}")
                    return [f"Error fetching {url}: HTTP {response.status}"]
                html_text = await response.text()
                soup = BeautifulSoup(html_text, 'html.parser')
                # Prefer the semantic article/main region; fall back to the
                # whole document.
                root = soup.find('article') or soup.find('main') or soup
                snippets = []
                # One pass per tag keeps the original grouping of extracted
                # text (all paragraphs, then all h1s, ...).
                for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
                    for element in root.find_all(tag):
                        cleaned = clean_text(element.get_text(strip=True))
                        if cleaned:
                            snippets.append(cleaned)
                logger.info(f"Found {len(snippets)} content items for {url}")
                return snippets
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return [f"Error processing {url}: {str(e)}"]
|
| 96 |
+
|
| 97 |
+
async def get_links(session, url, base_url):
    """Collect absolute same-site links found on `url`.

    Only links under `base_url` (excluding `url` itself) are returned;
    any failure yields an empty list.
    """
    try:
        async with rate_limiter:
            async with session.get(url, timeout=30) as response:
                if response.status != 200:
                    logger.error(f"Error fetching links from {url}: HTTP {response.status}")
                    return []
                html_text = await response.text()
                soup = BeautifulSoup(html_text, 'html.parser')
                found = []
                for anchor in soup.find_all('a', href=True):
                    # Resolve relative hrefs against the current page.
                    resolved = urljoin(url, anchor['href'])
                    if resolved.startswith(base_url) and resolved != url:
                        found.append(resolved)
                return found
    except Exception as e:
        logger.error(f"Error getting links from {url}: {str(e)}")
        return []
|
| 117 |
+
|
| 118 |
+
async def crawl_pages(base_url, max_depth):
    """Breadth-first crawl of `base_url` up to `max_depth` links deep.

    Page text is cached in SQLite so repeated runs skip refetching. Returns
    a list of (url, content) tuples where content is a list of cleaned text
    snippets. Individual page failures are logged and skipped.
    """
    import ast  # local import: used only to parse cached content safely

    visited = set()
    to_visit = [(base_url, 0)]
    all_pages = []

    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=ssl_context)) as session:
        while to_visit:
            current_url, depth = to_visit.pop(0)
            if current_url in visited or depth > max_depth:
                continue

            visited.add(current_url)
            start_time = time.time()

            try:
                with get_db_connection() as conn:
                    c = conn.cursor()
                    c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
                    result = c.fetchone()

                if result:
                    # Cache hit: the column stores repr(list-of-str).
                    # FIX: literal_eval replaces the original eval(), which
                    # would execute arbitrary code if the cache file were
                    # ever tampered with; literal_eval parses the same
                    # str(list) format but only accepts Python literals.
                    content = ast.literal_eval(result[0])
                else:
                    content = await get_page_content(session, current_url)
                    with get_db_connection() as conn:
                        c = conn.cursor()
                        c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
                        conn.commit()

                all_pages.append((current_url, content))
                logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

                if depth < max_depth:
                    links = await get_links(session, current_url, base_url)
                    for link in links:
                        if link not in visited:
                            to_visit.append((link, depth + 1))
            except Exception as e:
                logger.error(f"Error processing {current_url}: {str(e)}")
                # Continue with the next URL even if this one fails

    return all_pages
|
| 160 |
+
|
| 161 |
+
def generate_pdf_chunk(chunk, output_file):
    """Render one chunk of (url, snippets) pairs into a PDF at `output_file`."""
    doc = FPDF()
    doc.set_auto_page_break(auto=True, margin=15)
    doc.add_page()
    doc.set_font("Arial", size=12)

    for page_url, snippets in chunk:
        # Each page's URL acts as a heading for its extracted text.
        doc.cell(0, 10, txt=page_url, ln=True)
        doc.ln(5)
        for snippet in snippets:
            try:
                # Limit text length to avoid issues with very long runs.
                doc.multi_cell(0, 10, txt=snippet[:200])
            except Exception as e:
                logger.error(f"Error writing text to PDF: {str(e)}")
            # Start a fresh page when close to the bottom margin.
            if doc.get_y() > 250:
                doc.add_page()

    doc.output(output_file)
|
| 179 |
+
|
| 180 |
+
def website_to_pdf(all_pages, progress_callback):
    """Build a single PDF from all crawled pages and return it as bytes.

    Pages are rendered in chunks of 100 to bound per-pass memory, then the
    per-chunk PDFs are merged. `progress_callback` receives status strings.
    """
    logger.info(f"Starting PDF generation for {len(all_pages)} pages")

    chunk_size = 100
    total_chunks = (len(all_pages) + chunk_size - 1) // chunk_size

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_files = []
        for chunk_number, start in enumerate(range(0, len(all_pages), chunk_size), start=1):
            chunk_path = os.path.join(temp_dir, f"chunk_{start}.pdf")
            generate_pdf_chunk(all_pages[start:start + chunk_size], chunk_path)
            temp_files.append(chunk_path)

            progress = min((start + chunk_size) / len(all_pages), 1.0)
            progress_callback(f"Processing pages... {progress:.0%}")
            logger.info(f"Generated PDF chunk {chunk_number}/{total_chunks}")

        logger.info("Merging PDF chunks...")
        output_pdf = os.path.join(temp_dir, "final.pdf")
        merger = PdfMerger()
        for chunk_path in temp_files:
            merger.append(chunk_path)
        merger.write(output_pdf)
        merger.close()

        logger.info("PDF generation complete. Reading final PDF...")
        with open(output_pdf, 'rb') as f:
            return f.read()
|
| 209 |
+
|
| 210 |
+
async def process_url(url, depth, progress_callback):
    """Crawl `url` to `depth` and render the result to PDF bytes.

    Returns the PDF content as bytes on success, or a human-readable error
    string on failure (callers distinguish the two cases by type).
    """
    try:
        all_pages = await crawl_pages(url, depth)
        if not all_pages:
            return "No pages were successfully crawled. Please check the URL and try again."

        logger.info("Crawling complete. Starting PDF generation...")
        # Run the heavy PDF build in a worker thread so the event loop stays
        # responsive.
        # FIX: get_running_loop() is the supported call from inside a
        # coroutine; get_event_loop() is deprecated in this context since
        # Python 3.10 and can warn or fail on newer interpreters.
        loop = asyncio.get_running_loop()
        pdf_content = await loop.run_in_executor(executor, website_to_pdf, all_pages, progress_callback)
        logger.info("PDF generation complete.")
        return pdf_content
    except Exception as e:
        logger.error(f"Error in process_url: {str(e)}")
        return f"An error occurred: {str(e)}"
|
| 225 |
|
| 226 |
# App layout: a single centered card with the URL form, progress spinner,
# and download plumbing. The two dcc.Store components hold the finished PDF
# (base64) and intermediate progress state client-side.
app.layout = dbc.Container([
    dcc.Store(id='pdf-store'),
    dcc.Store(id='progress-store'),
    dbc.Card(
        dbc.CardBody([
            html.H1("Website to PDF Converter", className="text-center mb-4"),
            html.P("Enter docs URL and crawl depth to convert documentation pages into a PDF. Be responsible for sites you have permission to do this", className="text-center mb-4"),
            dbc.Input(id="url-input", type="text", placeholder="Enter website URL (e.g., https://www.gradio.app/docs)", className="mb-3"),
            # Crawl depth 1-10; default 3.
            dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
            dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
            # Disabled until a PDF is ready.
            dbc.Button("Download PDF", id="download-button", color="secondary", className="mb-3 w-100", disabled=True),
            html.Div([
                dbc.Spinner(html.Div(id="progress-message"), color="primary", type="grow", size="lg"),
            ], className="text-center mb-3"),
            # Polls every second while a conversion is running.
            dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
            dcc.Download(id="download-pdf")
        ]),
        className="mt-4"
    )
], fluid=True)
|
| 247 |
|
| 248 |
+
def update_output(n_clicks, n_intervals, progress_data, url, depth):
    """Dash callback driving the convert/progress UI state machine.

    NOTE(review): the @app.callback decorator for this function is not
    visible in this chunk. The 5-tuple returned below presumably maps to
    (submit-button disabled, download-button color, progress-interval
    disabled, pdf-store data, progress message) — confirm against the
    decorator's Output list before changing the return shape.
    """
    ctx = dash.callback_context
    if not ctx.triggered:
        raise PreventUpdate

    # Which component fired this callback (e.g. "submit-button").
    triggered_id = ctx.triggered[0]['prop_id'].split('.')[0]

    if triggered_id == "submit-button":
        if not url:
            return True, "secondary", True, None, "Please enter a URL"

        # Start the background task
        task_id = str(uuid.uuid4())
        executor.submit(background_task, url, depth, task_id)

        return True, "secondary", False, None, "Processing... Please wait."

    elif triggered_id == "progress-interval" or triggered_id == "progress-store":
        if progress_data is None:
            return True, "secondary", False, None, "Processing... Please wait."

        # String payloads are status or error messages.
        if isinstance(progress_data, str):
            if progress_data.startswith("Error"):
                return True, "secondary", True, None, progress_data
            else:
                return True, "secondary", False, None, progress_data

        # Bytes payload means the PDF is finished: base64-encode it so it
        # survives the JSON transport into the dcc.Store.
        if isinstance(progress_data, bytes):
            encoded = base64.b64encode(progress_data).decode()
            return False, "primary", True, encoded, "PDF ready for download!"

    # Fallback: keep the UI in its idle/processing default state.
    return True, "secondary", False, None, ""
|
| 280 |
|
| 281 |
@app.callback(
    Output("download-pdf", "data"),
    Input("download-button", "n_clicks"),
    State("pdf-store", "data"),
    prevent_initial_call=True
)
def download_pdf(n_clicks, pdf_data):
    """Decode the base64 PDF held in pdf-store and stream it to the browser."""
    if pdf_data is None:
        raise PreventUpdate

    # Timestamped filename so repeated downloads do not collide.
    filename = f"website_content_{int(time.time())}.pdf"
    return dcc.send_bytes(base64.b64decode(pdf_data), filename)
|
| 293 |
+
|
| 294 |
+
def background_task(url, depth, task_id):
    """Run the crawl + PDF pipeline synchronously in a worker thread.

    NOTE(review): `task_id` is accepted but never used in this body —
    presumably intended to key per-task progress; confirm before relying
    on it.
    """
    def progress_callback(message):
        # Update progress in the progress-store
        # NOTE(review): assigning to app.layout.children[1].data mutates the
        # server-side layout object only; Dash does not push that change to
        # already-connected clients, so these progress updates likely never
        # reach the UI. Confirm, and consider a server-side cache (e.g.
        # diskcache) keyed by task_id instead.
        app.layout.children[1].data = message

    try:
        logger.info(f"Starting background task for URL: {url}, depth: {depth}")
        # Bridge into the async crawler from this plain thread.
        pdf_content = asyncio.run(process_url(url, depth, progress_callback))
        logger.info("Background task completed successfully")
        # Store the PDF content directly in the progress-store
        app.layout.children[1].data = pdf_content
    except Exception as e:
        logger.error(f"Error in background task: {str(e)}")
        app.layout.children[1].data = f"Error: {str(e)}"
|
| 308 |
|
| 309 |
if __name__ == '__main__':
|
| 310 |
print("Starting the Dash application...")
|