Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -195,118 +195,4 @@ def website_to_pdf(all_pages, progress_callback):
|
|
| 195 |
progress_callback(f"Processing pages... {progress:.0%}")
|
| 196 |
logger.info(f"Generated PDF chunk {i//chunk_size + 1}/{total_chunks}")
|
| 197 |
|
| 198 |
-
logger.info("
|
| 199 |
-
output_pdf = os.path.join(temp_dir, "final.pdf")
|
| 200 |
-
merger = PdfMerger()
|
| 201 |
-
for temp_file in temp_files:
|
| 202 |
-
merger.append(temp_file)
|
| 203 |
-
merger.write(output_pdf)
|
| 204 |
-
merger.close()
|
| 205 |
-
|
| 206 |
-
logger.info("PDF generation complete. Reading final PDF...")
|
| 207 |
-
with open(output_pdf, 'rb') as f:
|
| 208 |
-
return f.read()
|
| 209 |
-
|
| 210 |
-
async def process_url(url, depth, progress_callback):
|
| 211 |
-
try:
|
| 212 |
-
all_pages = await crawl_pages(url, depth)
|
| 213 |
-
if not all_pages:
|
| 214 |
-
return "No pages were successfully crawled. Please check the URL and try again."
|
| 215 |
-
|
| 216 |
-
logger.info("Crawling complete. Starting PDF generation...")
|
| 217 |
-
# Use ThreadPoolExecutor to run PDF generation in a separate thread
|
| 218 |
-
loop = asyncio.get_event_loop()
|
| 219 |
-
pdf_content = await loop.run_in_executor(executor, website_to_pdf, all_pages, progress_callback)
|
| 220 |
-
logger.info("PDF generation complete.")
|
| 221 |
-
return pdf_content
|
| 222 |
-
except Exception as e:
|
| 223 |
-
logger.error(f"Error in process_url: {str(e)}")
|
| 224 |
-
return f"An error occurred: {str(e)}"
|
| 225 |
-
|
| 226 |
-
# App layout
|
| 227 |
-
app.layout = dbc.Container([
|
| 228 |
-
dcc.Store(id='pdf-store'),
|
| 229 |
-
dcc.Store(id='progress-store'),
|
| 230 |
-
dbc.Card(
|
| 231 |
-
dbc.CardBody([
|
| 232 |
-
html.H1("Website to PDF Converter", className="text-center mb-4"),
|
| 233 |
-
html.P("Enter docs URL and crawl depth to convert documentation pages into a PDF. Be responsible for sites you have permission to do this", className="text-center mb-4"),
|
| 234 |
-
dbc.Input(id="url-input", type="text", placeholder="Enter website URL (e.g., https://www.gradio.app/docs)", className="mb-3"),
|
| 235 |
-
dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
|
| 236 |
-
dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
|
| 237 |
-
dbc.Button("Download PDF", id="download-button", color="secondary", className="mb-3 w-100", disabled=True),
|
| 238 |
-
html.Div([
|
| 239 |
-
dbc.Spinner(html.Div(id="progress-message"), color="primary", type="grow", size="lg"),
|
| 240 |
-
], className="text-center mb-3"),
|
| 241 |
-
dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
|
| 242 |
-
dcc.Download(id="download-pdf")
|
| 243 |
-
]),
|
| 244 |
-
className="mt-4"
|
| 245 |
-
)
|
| 246 |
-
], fluid=True)
|
| 247 |
-
|
| 248 |
-
def update_output(n_clicks, n_intervals, progress_data, url, depth):
|
| 249 |
-
ctx = dash.callback_context
|
| 250 |
-
if not ctx.triggered:
|
| 251 |
-
raise PreventUpdate
|
| 252 |
-
|
| 253 |
-
triggered_id = ctx.triggered[0]['prop_id'].split('.')[0]
|
| 254 |
-
|
| 255 |
-
if triggered_id == "submit-button":
|
| 256 |
-
if not url:
|
| 257 |
-
return True, "secondary", True, None, "Please enter a URL"
|
| 258 |
-
|
| 259 |
-
# Start the background task
|
| 260 |
-
task_id = str(uuid.uuid4())
|
| 261 |
-
executor.submit(background_task, url, depth, task_id)
|
| 262 |
-
|
| 263 |
-
return True, "secondary", False, None, "Processing... Please wait."
|
| 264 |
-
|
| 265 |
-
elif triggered_id == "progress-interval" or triggered_id == "progress-store":
|
| 266 |
-
if progress_data is None:
|
| 267 |
-
return True, "secondary", False, None, "Processing... Please wait."
|
| 268 |
-
|
| 269 |
-
if isinstance(progress_data, str):
|
| 270 |
-
if progress_data.startswith("Error"):
|
| 271 |
-
return True, "secondary", True, None, progress_data
|
| 272 |
-
else:
|
| 273 |
-
return True, "secondary", False, None, progress_data
|
| 274 |
-
|
| 275 |
-
if isinstance(progress_data, bytes):
|
| 276 |
-
encoded = base64.b64encode(progress_data).decode()
|
| 277 |
-
return False, "primary", True, encoded, "PDF ready for download!"
|
| 278 |
-
|
| 279 |
-
return True, "secondary", False, None, ""
|
| 280 |
-
|
| 281 |
-
@app.callback(
|
| 282 |
-
Output("download-pdf", "data"),
|
| 283 |
-
Input("download-button", "n_clicks"),
|
| 284 |
-
State("pdf-store", "data"),
|
| 285 |
-
prevent_initial_call=True
|
| 286 |
-
)
|
| 287 |
-
def download_pdf(n_clicks, pdf_data):
|
| 288 |
-
if pdf_data is None:
|
| 289 |
-
raise PreventUpdate
|
| 290 |
-
|
| 291 |
-
decoded = base64.b64decode(pdf_data)
|
| 292 |
-
return dcc.send_bytes(decoded, f"website_content_{int(time.time())}.pdf")
|
| 293 |
-
|
| 294 |
-
def background_task(url, depth, task_id):
|
| 295 |
-
def progress_callback(message):
|
| 296 |
-
# Update progress in the progress-store
|
| 297 |
-
app.layout.children[1].data = message
|
| 298 |
-
|
| 299 |
-
try:
|
| 300 |
-
logger.info(f"Starting background task for URL: {url}, depth: {depth}")
|
| 301 |
-
pdf_content = asyncio.run(process_url(url, depth, progress_callback))
|
| 302 |
-
logger.info("Background task completed successfully")
|
| 303 |
-
# Store the PDF content directly in the progress-store
|
| 304 |
-
app.layout.children[1].data = pdf_content
|
| 305 |
-
except Exception as e:
|
| 306 |
-
logger.error(f"Error in background task: {str(e)}")
|
| 307 |
-
app.layout.children[1].data = f"Error: {str(e)}"
|
| 308 |
-
|
| 309 |
-
if __name__ == '__main__':
|
| 310 |
-
print("Starting the Dash application...")
|
| 311 |
-
app.run(debug=True, host='0.0.0.0', port=7860)
|
| 312 |
-
print("Dash application has finished running.")
|
|
|
|
| 195 |
progress_callback(f"Processing pages... {progress:.0%}")
|
| 196 |
logger.info(f"Generated PDF chunk {i//chunk_size + 1}/{total_chunks}")
|
| 197 |
|
| 198 |
+
logger.info("Mer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|