bluenevus committed on
Commit
8ccfdbd
·
verified ·
1 Parent(s): d2ab8de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -272
app.py CHANGED
@@ -2,299 +2,309 @@ import dash
2
  from dash import dcc, html, Input, Output, State
3
  import dash_bootstrap_components as dbc
4
  from dash.exceptions import PreventUpdate
5
- import google.generativeai as genai
6
- from github import Github
7
- import gitlab
8
  import requests
9
- import tempfile
10
- import docx
11
- import os
 
12
  import logging
13
- import threading
14
- from huggingface_hub import HfApi
15
- from flask import send_file
16
-
17
# Configure logging: timestamped records at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
server = app.server  # Expose the Flask server

# Hugging Face API setup
hf_api = HfApi()

# Credentials come from the environment; each is None when the variable is unset.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')
31
-
32
# File-name suffixes (lower case) that mark a file as UI/template source.
_UI_EXTENSIONS = (
    '.erb', '.haml', '.slim', '.php', '.aspx', '.jsp', '.ftl', '.twig',
    '.mustache', '.handlebars', '.ejs', '.pug', '.blade.php', '.xhtml',
    '.fxml', '.tsx', '.jsx', '.vue', '.html', '.cshtml', '.razor', '.xaml',
)


def is_ui_file(filename):
    """Return True when *filename* ends with one of the known UI extensions.

    The comparison is case-insensitive, so ``INDEX.HTML`` is recognised just
    like ``index.html``.
    """
    # str.endswith accepts a tuple, so one call covers every extension.
    return filename.lower().endswith(_UI_EXTENSIONS)
35
-
36
def _is_excluded(path, folders):
    """Return True when *path* is one of *folders* or lies beneath one of them."""
    return any(path == folder or path.startswith(folder + '/') for folder in folders)


def get_file_contents(git_provider, repo_url, exclude_folders):
    """Collect the UI source files of a repository.

    Args:
        git_provider: "GitHub", "GitLab" or "Gitea".
        repo_url: Repository identifier in ``owner/repo`` form.
        exclude_folders: Comma-separated folder paths to skip; may be None or
            empty when the form field was left blank.

    Returns:
        A list of ``(path, text)`` tuples, one per file accepted by
        ``is_ui_file``.

    Raises:
        ValueError: If *git_provider* is not one of the supported values.
    """
    file_contents = []
    logger.info(f"Fetching files from {git_provider} repository: {repo_url}")

    # Normalise "a/, b" -> ["a", "b"]; tolerate a missing value.
    excluded = []
    for folder in (exclude_folders or '').split(','):
        folder = folder.strip().strip('/')
        if folder:
            excluded.append(folder)

    # NOTE: GITHUB_TOKEN is used as the credential for all three providers.
    if git_provider == "GitHub":
        repo = Github(GITHUB_TOKEN).get_repo(repo_url)
        pending = repo.get_contents("")
        while pending:
            entry = pending.pop(0)
            if _is_excluded(entry.path, excluded):
                continue
            if entry.type == "dir":
                pending.extend(repo.get_contents(entry.path))
            elif is_ui_file(entry.name):
                logger.info(f"Found UI file: {entry.path}")
                file_contents.append(
                    (entry.path, entry.decoded_content.decode('utf-8', errors='ignore'))
                )
    elif git_provider == "GitLab":
        gl = gitlab.Gitlab(url='https://gitlab.com', private_token=GITHUB_TOKEN)
        project = gl.projects.get(repo_url)
        # Read from the project's own default branch instead of assuming "main".
        ref = getattr(project, 'default_branch', None) or 'main'
        # NOTE(review): repository_tree may be paginated by default -- confirm
        # whether the installed python-gitlab needs get_all/all to list everything.
        for item in project.repository_tree(recursive=True, ref=ref):
            if item['type'] != 'blob' or not is_ui_file(item['name']):
                continue
            if _is_excluded(item['path'], excluded):
                continue
            logger.info(f"Found UI file: {item['path']}")
            remote_file = project.files.get(item['path'], ref=ref)
            file_contents.append(
                (item['path'], remote_file.decode().decode('utf-8', errors='ignore'))
            )
    elif git_provider == "Gitea":
        base_url = "https://gitea.com/api/v1"
        headers = {"Authorization": f"token {GITHUB_TOKEN}"}

        def recursive_get_contents(path=""):
            response = requests.get(
                f"{base_url}/repos/{repo_url}/contents/{path}",
                headers=headers,
                timeout=30,
            )
            response.raise_for_status()
            for item in response.json():
                if _is_excluded(item['path'], excluded):
                    continue
                if item['type'] == 'file' and is_ui_file(item['name']):
                    logger.info(f"Found UI file: {item['path']}")
                    # Authenticate the download too and fail loudly rather than
                    # storing an HTTP error page as file content.
                    download = requests.get(item['download_url'], headers=headers, timeout=30)
                    download.raise_for_status()
                    file_contents.append((item['path'], download.text))
                elif item['type'] == 'dir':
                    recursive_get_contents(item['path'])

        recursive_get_contents()
    else:
        raise ValueError("Unsupported Git provider")

    logger.info(f"Total UI files found: {len(file_contents)}")
    return file_contents
79
-
80
def generate_guide_section(file_path, file_content, guide_type):
    """Ask Gemini to write one guide section for a single UI source file.

    Args:
        file_path: Path of the file inside the repository (embedded in the prompt).
        file_content: Text of the file (embedded in the prompt).
        guide_type: "User Guide" selects the end-user prompt; any other value
            selects the system/administration prompt.

    Returns:
        The ``text`` of the model response.
    """
    logger.info(f"Generating {guide_type} section for file: {file_path}")
    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel('gemini-2.0-flash-lite')

    if guide_type != "User Guide":
        # Administration Guide
        prompt = f"""Based on the following UI-related code file, generate a section for an System guide:

File: {file_path}
Content:
{file_content}

Please focus on explaining what that component is and does:
1. Any configuration options or settings related to this UI component
2. Security considerations or access control related to this feature
3. How to monitor or troubleshoot issues with this component
4. Best practices for managing and maintaining this part of the system

Important formatting instructions:
- The output should be in plain text no markdown for example for example do not use * or ** or # or ##. Instead use numbers like 1., 2. for bullets
- Use clear section titles
- Use clear section titles that has the name of the file in parenthesis
- Follow this numbering heirarchy (1.0, 1.1, 1.2), (2.0, 2.1, 2.2), (3.0, 3.1, 3.2)
- Explain the purpose and implications of each component
"""
    else:
        prompt = f"""Based on the following UI-related code file, generate a section for a user guide:

File: {file_path}
Content:
{file_content}

Please focus on:
1. The specific features and functionality this UI component provides to the end users
2. Step-by-step instructions on how to use these features
3. Any user interactions or inputs required
4. Expected outcomes or results for the user

Important formatting instructions:
- The output should be in plain text no markdown for example do not use * or ** or # or ##. Instead use numbers like 1., 2. for bullets
- Use clear section titles
- Follow this numbering heirarchy (1.0, 1.1, 1.2), (2.0, 2.1, 2.2), (3.0, 3.1, 3.2)
- Explain the purpose and benefit of each feature for non-technical users
- This is an end user manual, not a system administration manual so focus on the end user components
"""

    answer = model.generate_content(prompt)
    logger.info(f"Generated {guide_type} section for {file_path}")
    return answer.text
129
-
130
def generate_guide(git_provider, repo_url, guide_type, exclude_folders):
    """Build a complete guide for a repository and write it to DOCX and Markdown.

    Args:
        git_provider: Provider name passed through to ``get_file_contents``.
        repo_url: Repository identifier passed through to ``get_file_contents``.
        guide_type: Guide flavour; used as the document title and passed to
            ``generate_guide_section``.
        exclude_folders: Comma-separated folders to skip.

    Returns:
        ``(guide_text, docx_path, md_path)`` on success. On any failure the
        error is logged and ``("An error occurred: ...", None, None)`` is
        returned instead of raising.
    """
    try:
        logger.info(f"Starting guide generation for {repo_url}")
        file_contents = get_file_contents(git_provider, repo_url, exclude_folders)

        guide_sections = []
        for file_path, content in file_contents:
            guide_sections.append(generate_guide_section(file_path, content, guide_type))
            logger.info(f"Added section for {file_path}")

        sections_text = "\n\n".join(guide_sections)
        full_guide = f"# {guide_type}\n\n" + sections_text

        logger.info("Creating DOCX file")
        doc = docx.Document()
        doc.add_heading(guide_type, 0)

        # Walk only the section text: the title is already the level-0 heading,
        # so feeding the "# title" line through again would print it twice.
        for line in sections_text.split('\n'):
            line = line.strip()
            if line.startswith('# '):
                doc.add_heading(line[2:], level=1)
            elif line.startswith('## '):
                doc.add_heading(line[3:], level=2)
            elif line.startswith('Step'):
                doc.add_paragraph(line, style='List Number')
            else:
                doc.add_paragraph(line)

        # Reserve a path, close the handle, then save: writing by name while the
        # temporary file is still open fails on platforms that lock open files.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_docx:
            docx_path = temp_docx.name
        doc.save(docx_path)
        logger.info(f"DOCX file saved: {docx_path}")

        logger.info("Creating Markdown file")
        with tempfile.NamedTemporaryFile(
            delete=False, suffix='.md', mode='w', encoding='utf-8'
        ) as temp_md:
            temp_md.write(full_guide)
            md_path = temp_md.name
        logger.info(f"Markdown file saved: {md_path}")

        logger.info("Guide generation completed successfully")
        return full_guide, docx_path, md_path

    except Exception as e:
        # Boundary handler: the UI shows the message instead of a stack trace.
        logger.error(f"An error occurred: {str(e)}", exc_info=True)
        return f"An error occurred: {str(e)}", None, None
175
 
176
  # App layout
177
# Navigation bar: logo plus product name, both linking to the root page.
_navbar = dbc.Navbar(
    dbc.Container([
        html.A(
            dbc.Row(
                [
                    dbc.Col(html.Img(src="/assets/logo.png", height="30px")),
                    dbc.Col(dbc.NavbarBrand("Automated Guide Generator", className="ms-2")),
                ],
                align="center",
                className="g-0",
            ),
            href="/",
            style={"textDecoration": "none"},
        )
    ]),
    color="primary",
    dark=True,
)

# Input form: provider, repository, guide type, excluded folders, submit button.
_form_card = dbc.Card([
    dbc.CardBody([
        dbc.Form([
            dbc.Select(
                id="git-provider",
                options=[
                    {"label": "GitHub", "value": "GitHub"},
                    {"label": "GitLab", "value": "GitLab"},
                    {"label": "Gitea", "value": "Gitea"}
                ],
                placeholder="Select Git Provider",
            ),
            dbc.Input(id="repo-url", type="text", placeholder="Repository URL (owner/repo)"),
            dbc.RadioItems(
                id="guide-type",
                options=[
                    {"label": "User Guide", "value": "User Guide"},
                    {"label": "Administration Guide", "value": "Administration Guide"}
                ],
                inline=True,
            ),
            dbc.Input(id="exclude-folders", type="text", placeholder="Exclude Folders (comma-separated)"),
            dbc.Button("Generate Guide", id="generate-button", color="primary", className="mt-3"),
        ])
    ])
], className="mb-4")

# Download buttons and their dcc.Download targets, wrapped in a spinner.
_downloads = dbc.Spinner(
    dbc.Card([
        dbc.CardBody([
            html.H4("Generated Guide", className="card-title"),
            html.Div([
                dbc.Button("Download DOCX", id="download-docx", color="secondary", className="me-2"),
                dbc.Button("Download Markdown", id="download-md", color="secondary"),
            ], className="mt-3"),
            dcc.Download(id="download-docx-file"),
            dcc.Download(id="download-md-file"),
        ])
    ], className="mt-4"),
    color="primary",
)

# Scrollable preview of the generated guide text.
_preview_card = dbc.Card([
    dbc.CardBody([
        html.H4("Preview", className="card-title"),
        html.Div(id="generated-guide", style={"whiteSpace": "pre-wrap", "height": "400px", "overflowY": "auto"}),
    ])
], className="mt-4")

app.layout = dbc.Container([
    _navbar,
    dbc.Row([
        dbc.Col([
            html.H1("Automated Guide Generator", className="text-center my-4"),
            html.P("Generate a user guide or administration guide based on the UI-related code in a Git repository using Gemini AI. Select a Git provider, enter repository details, choose the guide type, and let AI create a comprehensive guide.", className="text-center mb-4"),
            _form_card,
            _downloads,
        ], width=6),
        dbc.Col([
            _preview_card,
        ], width=6),
    ])
], fluid=True)
253
 
254
# Paths of the most recently generated files. The download callbacks run in
# separate invocations from the generation callback, so the paths cannot live
# in update_output's local variables. NOTE: one slot shared by all sessions.
_generated_files = {"docx": None, "md": None}


@app.callback(
    [Output("generated-guide", "children"),
     Output("download-docx", "n_clicks"),
     Output("download-md", "n_clicks")],
    [Input("generate-button", "n_clicks")],
    [State("git-provider", "value"),
     State("repo-url", "value"),
     State("guide-type", "value"),
     State("exclude-folders", "value")]
)
def update_output(n_clicks, git_provider, repo_url, guide_type, exclude_folders):
    """Generate the guide when "Generate Guide" is clicked.

    Returns the guide text for the preview pane and resets both download
    buttons' click counters to 0.
    """
    if n_clicks is None:
        raise PreventUpdate

    # Called directly: starting a thread and joining it at once blocks exactly
    # as long and adds nothing.
    guide_text, docx_path, md_path = generate_guide(
        git_provider, repo_url, guide_type, exclude_folders
    )
    _generated_files["docx"] = docx_path
    _generated_files["md"] = md_path

    return guide_text, 0, 0  # Reset n_clicks for download buttons


@app.callback(
    Output("download-docx-file", "data"),
    Input("download-docx", "n_clicks"),
    prevent_initial_call=True,
)
def download_docx(n_clicks):
    """Send the generated DOCX file once the user clicks "Download DOCX"."""
    path = _generated_files["docx"]
    # n_clicks is 0 right after update_output resets it; that is not a click.
    if not n_clicks or path is None:
        raise PreventUpdate
    return dcc.send_file(path, filename="generated_guide.docx")


@app.callback(
    Output("download-md-file", "data"),
    Input("download-md", "n_clicks"),
    prevent_initial_call=True,
)
def download_md(n_clicks):
    """Send the generated Markdown file once the user clicks "Download Markdown"."""
    path = _generated_files["md"]
    # n_clicks is 0 right after update_output resets it; that is not a click.
    if not n_clicks or path is None:
        raise PreventUpdate
    return dcc.send_file(path, filename="generated_guide.md")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
 
299
  if __name__ == '__main__':
300
  print("Starting the Dash application...")
 
2
  from dash import dcc, html, Input, Output, State
3
  import dash_bootstrap_components as dbc
4
  from dash.exceptions import PreventUpdate
5
+ import base64
 
 
6
  import requests
7
+ from bs4 import BeautifulSoup
8
+ from urllib.parse import urljoin, urlparse
9
+ from fpdf import FPDF
10
+ import re
11
  import logging
12
+ import asyncio
13
+ import aiohttp
14
+ from aiolimiter import AsyncLimiter
15
+ import sqlite3
16
+ from contextlib import contextmanager
17
+ from threading import local
18
+ import time
19
+ import os
20
+ import ssl
21
+ from io import BytesIO
22
+ import tempfile
23
+ import uuid
24
+ from concurrent.futures import ThreadPoolExecutor
25
+ from PyPDF2 import PdfMerger
26
 
27
# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
server = app.server

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Thread-local storage for database connections (one connection per thread)
thread_local = local()

# Rate limiter: 10 requests per second
rate_limiter = AsyncLimiter(10, 1)

# SSL context that ignores certificate verification.
# NOTE(review): hostname checking and certificate validation are both switched
# off, so HTTPS responses are accepted from any server. Confirm this is
# intended before crawling anything sensitive.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE

# ThreadPoolExecutor for background tasks
executor = ThreadPoolExecutor(max_workers=4)
48
+
49
@contextmanager
def get_db_connection():
    """Yield the calling thread's SQLite connection to ``crawl_cache.db``.

    The connection is opened on first use and stored on ``thread_local``. It is
    deliberately left open when the ``with`` block ends so that later calls on
    the same thread reuse it.
    """
    connection = getattr(thread_local, "connection", None)
    if connection is None:
        connection = sqlite3.connect('crawl_cache.db')
        thread_local.connection = connection
    yield connection
57
+
58
def init_db():
    """Create the ``pages`` cache table if it does not exist yet.

    Columns: ``url`` (primary key), ``content`` (text) and ``depth`` (integer).
    """
    with get_db_connection() as conn:
        # No separate index on url: SQLite already maintains a unique index for
        # a TEXT PRIMARY KEY, so an extra one would only duplicate it.
        conn.execute('''CREATE TABLE IF NOT EXISTS pages
                        (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
        conn.commit()
65
+
66
+ init_db()
67
+
68
def clean_text(text):
    """Reduce *text* to printable ASCII.

    Steps, in order:
      1. Every run of whitespace (newlines, tabs, non-breaking spaces, ...)
         becomes a single space, so words separated only by a line break are
         not glued together when non-printable characters are dropped.
      2. Remaining non-printable characters are removed.
      3. Every run of non-ASCII characters becomes a single space.
    """
    text = re.sub(r'\s+', ' ', text)
    text = ''.join(char for char in text if char.isprintable())
    return re.sub(r'[^\x00-\x7F]+', ' ', text)
72
+
73
async def get_page_content(session, url):
    """Fetch *url* and return its readable text as a list of strings.

    Text is taken from the first ``<article>``, else the first ``<main>``, else
    the whole document. Paragraphs, headings and list items are returned in the
    order they appear on the page.

    Returns:
        A list of cleaned text fragments. On a non-200 response or any
        exception, a one-element list holding the error message (the error is
        also logged); this function does not raise.
    """
    content_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']
    try:
        async with rate_limiter:
            async with session.get(url, timeout=30) as response:
                if response.status != 200:
                    logger.error(f"Error fetching {url}: HTTP {response.status}")
                    return [f"Error fetching {url}: HTTP {response.status}"]
                html_text = await response.text()

        soup = BeautifulSoup(html_text, 'html.parser')
        main_content = soup.find('article') or soup.find('main') or soup

        content = []
        # One find_all over all tag names keeps document order; searching tag by
        # tag would list every paragraph before the headings that introduce them.
        for element in main_content.find_all(content_tags):
            fragment = clean_text(element.get_text(strip=True))
            if fragment:
                content.append(fragment)

        logger.info(f"Found {len(content)} content items for {url}")
        return content
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return [f"Error processing {url}: {str(e)}"]
96
+
97
async def get_links(session, url, base_url):
    """Return the links on *url* that stay under *base_url*.

    Each ``href`` is resolved against *url* and its ``#fragment`` is removed, so
    in-page anchors do not show up as separate pages. Links equal to *url*
    itself and repeated links are dropped; order of first appearance is kept.

    Returns:
        A list of absolute URLs; an empty list on a non-200 response or any
        exception (the error is logged).
    """
    try:
        async with rate_limiter:
            async with session.get(url, timeout=30) as response:
                if response.status != 200:
                    logger.error(f"Error fetching links from {url}: HTTP {response.status}")
                    return []
                html_text = await response.text()

        soup = BeautifulSoup(html_text, 'html.parser')
        valid_links = []
        seen = set()
        for anchor in soup.find_all('a', href=True):
            absolute = urljoin(url, anchor['href'])
            # Drop the fragment: "page#a" and "page#b" are the same document.
            absolute = urlparse(absolute)._replace(fragment='').geturl()
            if absolute == url or absolute in seen:
                continue
            if absolute.startswith(base_url):
                seen.add(absolute)
                valid_links.append(absolute)
        return valid_links
    except Exception as e:
        logger.error(f"Error getting links from {url}: {str(e)}")
        return []
117
+
118
async def crawl_pages(base_url, max_depth):
    """Breadth-first crawl starting at *base_url*, at most *max_depth* links deep.

    Page text is read from the ``pages`` cache table when a row for the URL
    exists; otherwise it is fetched with ``get_page_content`` and stored.

    Returns:
        A list of ``(url, content)`` tuples in visiting order, where *content*
        is the list of text fragments for that page. A page whose processing
        raises is logged and skipped.
    """
    import ast
    from collections import deque

    visited = set()
    to_visit = deque([(base_url, 0)])  # FIFO queue -> breadth-first order
    all_pages = []

    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=ssl_context)) as session:
        while to_visit:
            current_url, depth = to_visit.popleft()
            if current_url in visited or depth > max_depth:
                continue

            visited.add(current_url)
            start_time = time.time()

            try:
                with get_db_connection() as conn:
                    row = conn.execute(
                        "SELECT content FROM pages WHERE url = ?", (current_url,)
                    ).fetchone()

                content = None
                if row:
                    # The cache stores str(list). literal_eval parses that form
                    # but, unlike eval, cannot execute code if the database
                    # file has been altered.
                    try:
                        content = ast.literal_eval(row[0])
                    except (ValueError, SyntaxError):
                        logger.warning(f"Discarding unreadable cache entry for {current_url}")

                if content is None:
                    content = await get_page_content(session, current_url)
                    with get_db_connection() as conn:
                        # OR REPLACE: overwrite an unreadable or concurrently
                        # written row instead of failing on the primary key.
                        conn.execute(
                            "INSERT OR REPLACE INTO pages VALUES (?, ?, ?)",
                            (current_url, str(content), depth),
                        )
                        conn.commit()

                all_pages.append((current_url, content))
                logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

                if depth < max_depth:
                    links = await get_links(session, current_url, base_url)
                    for link in links:
                        if link not in visited:
                            to_visit.append((link, depth + 1))
            except Exception as e:
                logger.error(f"Error processing {current_url}: {str(e)}")
                # Continue with the next URL even if this one fails

    return all_pages
160
+
161
def generate_pdf_chunk(chunk, output_file):
    """Write one PDF file containing the pages in *chunk*.

    Args:
        chunk: Iterable of ``(page_url, content)`` tuples, *content* being a
            list of text fragments.
        output_file: Path of the PDF to create.

    Each page starts with its URL, followed by its fragments. A fragment is cut
    to 200 characters; a fragment that cannot be written is logged and skipped.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    for page_url, content in chunk:
        # The built-in Arial font only covers Latin-1; replace anything else in
        # the URL with "?" so one unusual address cannot break the whole chunk.
        safe_url = page_url.encode('latin-1', 'replace').decode('latin-1')
        pdf.cell(0, 10, txt=safe_url, ln=True)
        pdf.ln(5)
        for text in content:
            try:
                pdf.multi_cell(0, 10, txt=text[:200])  # Limit text length to avoid issues
            except Exception as e:
                logger.error(f"Error writing text to PDF: {str(e)}")
            if pdf.get_y() > 250:  # Add a new page if the current page is almost full
                pdf.add_page()

    pdf.output(output_file)
179
+
180
def website_to_pdf(all_pages, progress_callback):
    """Render *all_pages* to a single PDF and return it as bytes.

    Pages are rendered in chunks of 100 into temporary PDF files, which are
    then merged. After each chunk ``progress_callback`` is called with a
    "Processing pages... NN%" message.

    Args:
        all_pages: List of ``(url, content)`` tuples.
        progress_callback: Callable taking one status string.

    Returns:
        The bytes of the merged PDF.
    """
    logger.info(f"Starting PDF generation for {len(all_pages)} pages")

    chunk_size = 100
    total_chunks = (len(all_pages) + chunk_size - 1) // chunk_size
    temp_files = []

    with tempfile.TemporaryDirectory() as temp_dir:
        for i in range(0, len(all_pages), chunk_size):
            chunk = all_pages[i:i + chunk_size]
            temp_file = os.path.join(temp_dir, f"chunk_{i}.pdf")
            generate_pdf_chunk(chunk, temp_file)
            temp_files.append(temp_file)

            progress = min((i + chunk_size) / len(all_pages), 1.0)
            progress_callback(f"Processing pages... {progress:.0%}")
            logger.info(f"Generated PDF chunk {i//chunk_size + 1}/{total_chunks}")

        logger.info("Merging PDF chunks...")
        output_pdf = os.path.join(temp_dir, "final.pdf")
        merger = PdfMerger()
        try:
            for temp_file in temp_files:
                merger.append(temp_file)
            merger.write(output_pdf)
        finally:
            # Release the merger's open chunk files even when merging fails,
            # otherwise the temporary directory cannot be cleaned up reliably.
            merger.close()

        logger.info("PDF generation complete. Reading final PDF...")
        with open(output_pdf, 'rb') as f:
            return f.read()
209
+
210
async def process_url(url, depth, progress_callback):
    """Crawl *url* to *depth* and convert the result to a PDF.

    Args:
        url: Start URL; also the prefix that followed links must share.
        depth: Maximum link depth passed to ``crawl_pages``.
        progress_callback: Callable taking one status string; forwarded to
            ``website_to_pdf``.

    Returns:
        The PDF as ``bytes`` on success. On failure a ``str`` beginning with
        "Error" (the prefix ``update_output`` tests for); this coroutine does
        not raise.
    """
    try:
        all_pages = await crawl_pages(url, depth)
        if not all_pages:
            return "Error: No pages were successfully crawled. Please check the URL and try again."

        logger.info("Crawling complete. Starting PDF generation...")
        loop = asyncio.get_running_loop()
        # Use the loop's default executor (None), not the module-level
        # `executor`: this coroutine already runs on one of that pool's four
        # workers, so waiting for another slot in the same pool can hang
        # forever once four conversions are in flight.
        pdf_content = await loop.run_in_executor(None, website_to_pdf, all_pages, progress_callback)
        logger.info("PDF generation complete.")
        return pdf_content
    except Exception as e:
        logger.error(f"Error in process_url: {str(e)}")
        return f"Error: {str(e)}"
225
 
226
  # App layout
227
# Page layout: two client-side stores followed by a single card holding the
# URL/depth inputs, the convert and download buttons, the progress message,
# the polling interval and the download target.
app.layout = dbc.Container(
    [
        dcc.Store(id='pdf-store'),
        dcc.Store(id='progress-store'),
        dbc.Card(
            dbc.CardBody([
                html.H1("Website to PDF Converter", className="text-center mb-4"),
                html.P(
                    "Enter docs URL and crawl depth to convert documentation pages into a PDF. Be responsible for sites you have permission to do this",
                    className="text-center mb-4",
                ),
                dbc.Input(
                    id="url-input",
                    type="text",
                    placeholder="Enter website URL (e.g., https://www.gradio.app/docs)",
                    className="mb-3",
                ),
                dcc.Slider(
                    id="depth-slider",
                    min=1,
                    max=10,
                    step=1,
                    value=3,
                    marks={i: str(i) for i in range(1, 11)},
                    className="mb-3",
                ),
                dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
                # Starts disabled; there is nothing to download yet.
                dbc.Button(
                    "Download PDF",
                    id="download-button",
                    color="secondary",
                    className="mb-3 w-100",
                    disabled=True,
                ),
                html.Div(
                    [
                        dbc.Spinner(html.Div(id="progress-message"), color="primary", type="grow", size="lg"),
                    ],
                    className="text-center mb-3",
                ),
                # One-second timer, initially disabled.
                dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
                dcc.Download(id="download-pdf")
            ]),
            className="mt-4"
        ),
    ],
    fluid=True,
)
247
 
248
# Progress/result of the most recent conversion. A worker thread cannot push
# data to the browser by assigning to components inside ``app.layout``, so the
# worker publishes here and the polling callback reads it back.
# NOTE: a single slot shared by every browser session.
_task_state = {"task_id": None, "progress": None}


def _publish_progress(task_id, value):
    """Record *value* for *task_id* unless a newer task has been started since."""
    if _task_state["task_id"] == task_id:
        _task_state["progress"] = value


@app.callback(
    Output("download-button", "disabled"),
    Output("download-button", "color"),
    Output("progress-interval", "disabled"),
    Output("pdf-store", "data"),
    Output("progress-message", "children"),
    Input("submit-button", "n_clicks"),
    Input("progress-interval", "n_intervals"),
    Input("progress-store", "data"),
    State("url-input", "value"),
    State("depth-slider", "value"),
    prevent_initial_call=True,
)
def update_output(n_clicks, n_intervals, progress_data, url, depth):
    """Start a conversion on submit and report its progress on every poll.

    Returns a 5-tuple: (download button disabled, download button color,
    polling interval disabled, base64 PDF for ``pdf-store`` or None,
    progress message).
    """
    ctx = dash.callback_context
    if not ctx.triggered:
        raise PreventUpdate

    triggered_id = ctx.triggered[0]['prop_id'].split('.')[0]

    if triggered_id == "submit-button":
        if not url:
            return True, "secondary", True, None, "Please enter a URL"

        # Register the new task before starting it so stale workers from an
        # earlier click can no longer overwrite the progress slot.
        task_id = str(uuid.uuid4())
        _task_state.update(task_id=task_id, progress=None)
        executor.submit(background_task, url, depth, task_id)

        # Enable the interval so the poll branch below starts running.
        return True, "secondary", False, None, "Processing... Please wait."

    if triggered_id in ("progress-interval", "progress-store"):
        latest = _task_state["progress"]
        if latest is None:
            latest = progress_data
        if latest is None:
            return True, "secondary", False, None, "Processing... Please wait."

        if isinstance(latest, str):
            # Strings are either a final error (stop polling) or a status line.
            finished = latest.startswith("Error")
            return True, "secondary", finished, None, latest

        if isinstance(latest, bytes):
            # dcc.Store holds JSON, so the PDF travels as base64 text.
            encoded = base64.b64encode(latest).decode()
            return False, "primary", True, encoded, "PDF ready for download!"

    return True, "secondary", False, None, ""


@app.callback(
    Output("download-pdf", "data"),
    Input("download-button", "n_clicks"),
    State("pdf-store", "data"),
    prevent_initial_call=True
)
def download_pdf(n_clicks, pdf_data):
    """Send the stored PDF to the browser when "Download PDF" is clicked."""
    if pdf_data is None:
        raise PreventUpdate

    decoded = base64.b64decode(pdf_data)
    return dcc.send_bytes(decoded, f"website_content_{int(time.time())}.pdf")


def background_task(url, depth, task_id):
    """Run one crawl-and-convert job on a worker thread.

    Progress strings, the final PDF bytes, or an "Error: ..." string are
    published for *task_id*, where ``update_output`` picks them up.
    """
    def progress_callback(message):
        _publish_progress(task_id, message)

    try:
        logger.info(f"Starting background task for URL: {url}, depth: {depth}")
        result = asyncio.run(process_url(url, depth, progress_callback))
        if isinstance(result, str):
            # process_url reports failures as plain strings; make sure they
            # carry the "Error" prefix that update_output uses to stop polling.
            if not result.startswith("Error"):
                result = f"Error: {result}"
            logger.error(f"Background task failed: {result}")
        else:
            logger.info("Background task completed successfully")
        _publish_progress(task_id, result)
    except Exception as e:
        logger.error(f"Error in background task: {str(e)}")
        _publish_progress(task_id, f"Error: {str(e)}")
308
 
309
  if __name__ == '__main__':
310
  print("Starting the Dash application...")