Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,299 +2,309 @@ import dash
|
|
| 2 |
from dash import dcc, html, Input, Output, State
|
| 3 |
import dash_bootstrap_components as dbc
|
| 4 |
from dash.exceptions import PreventUpdate
|
| 5 |
-
import
|
| 6 |
-
from github import Github
|
| 7 |
-
import gitlab
|
| 8 |
import requests
|
| 9 |
-
import
|
| 10 |
-
import
|
| 11 |
-
import
|
|
|
|
| 12 |
import logging
|
| 13 |
-
import
|
| 14 |
-
|
| 15 |
-
from
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# Initialize Dash app
|
| 22 |
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
|
| 23 |
-
server = app.server
|
| 24 |
-
|
| 25 |
-
#
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
1. Any configuration options or settings related to this UI component
|
| 114 |
-
2. Security considerations or access control related to this feature
|
| 115 |
-
3. How to monitor or troubleshoot issues with this component
|
| 116 |
-
4. Best practices for managing and maintaining this part of the system
|
| 117 |
-
|
| 118 |
-
Important formatting instructions:
|
| 119 |
-
- The output should be in plain text no markdown for example for example do not use * or ** or # or ##. Instead use numbers like 1., 2. for bullets
|
| 120 |
-
- Use clear section titles
|
| 121 |
-
- Use clear section titles that has the name of the file in parenthesis
|
| 122 |
-
- Follow this numbering heirarchy (1.0, 1.1, 1.2), (2.0, 2.1, 2.2), (3.0, 3.1, 3.2)
|
| 123 |
-
- Explain the purpose and implications of each component
|
| 124 |
-
"""
|
| 125 |
-
|
| 126 |
-
response = model.generate_content(prompt)
|
| 127 |
-
logger.info(f"Generated {guide_type} section for {file_path}")
|
| 128 |
-
return response.text
|
| 129 |
-
|
| 130 |
-
def generate_guide(git_provider, repo_url, guide_type, exclude_folders):
|
| 131 |
try:
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
guide_sections = []
|
| 136 |
-
for file_path, content in file_contents:
|
| 137 |
-
section = generate_guide_section(file_path, content, guide_type)
|
| 138 |
-
guide_sections.append(section)
|
| 139 |
-
logger.info(f"Added section for {file_path}")
|
| 140 |
-
|
| 141 |
-
full_guide = f"# {guide_type}\n\n" + "\n\n".join(guide_sections)
|
| 142 |
-
|
| 143 |
-
logger.info("Creating DOCX file")
|
| 144 |
-
doc = docx.Document()
|
| 145 |
-
doc.add_heading(guide_type, 0)
|
| 146 |
-
|
| 147 |
-
for line in full_guide.split('\n'):
|
| 148 |
-
line = line.strip()
|
| 149 |
-
if line.startswith('# '):
|
| 150 |
-
doc.add_heading(line[2:], level=1)
|
| 151 |
-
elif line.startswith('## '):
|
| 152 |
-
doc.add_heading(line[3:], level=2)
|
| 153 |
-
elif line.startswith('Step'):
|
| 154 |
-
doc.add_paragraph(line, style='List Number')
|
| 155 |
-
else:
|
| 156 |
-
doc.add_paragraph(line)
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix='.md', mode='w', encoding='utf-8') as temp_md:
|
| 165 |
-
temp_md.write(full_guide)
|
| 166 |
-
md_path = temp_md.name
|
| 167 |
-
logger.info(f"Markdown file saved: {md_path}")
|
| 168 |
-
|
| 169 |
-
logger.info("Guide generation completed successfully")
|
| 170 |
-
return full_guide, docx_path, md_path
|
| 171 |
-
|
| 172 |
except Exception as e:
|
| 173 |
-
logger.error(f"
|
| 174 |
-
return f"An error occurred: {str(e)}"
|
| 175 |
|
| 176 |
# App layout
|
| 177 |
app.layout = dbc.Container([
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
)
|
|
|
|
|
|
|
| 191 |
]),
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
),
|
| 195 |
-
|
| 196 |
-
dbc.Row([
|
| 197 |
-
dbc.Col([
|
| 198 |
-
html.H1("Automated Guide Generator", className="text-center my-4"),
|
| 199 |
-
html.P("Generate a user guide or administration guide based on the UI-related code in a Git repository using Gemini AI. Select a Git provider, enter repository details, choose the guide type, and let AI create a comprehensive guide.", className="text-center mb-4"),
|
| 200 |
-
|
| 201 |
-
dbc.Card([
|
| 202 |
-
dbc.CardBody([
|
| 203 |
-
dbc.Form([
|
| 204 |
-
dbc.Select(
|
| 205 |
-
id="git-provider",
|
| 206 |
-
options=[
|
| 207 |
-
{"label": "GitHub", "value": "GitHub"},
|
| 208 |
-
{"label": "GitLab", "value": "GitLab"},
|
| 209 |
-
{"label": "Gitea", "value": "Gitea"}
|
| 210 |
-
],
|
| 211 |
-
placeholder="Select Git Provider",
|
| 212 |
-
),
|
| 213 |
-
dbc.Input(id="repo-url", type="text", placeholder="Repository URL (owner/repo)"),
|
| 214 |
-
dbc.RadioItems(
|
| 215 |
-
id="guide-type",
|
| 216 |
-
options=[
|
| 217 |
-
{"label": "User Guide", "value": "User Guide"},
|
| 218 |
-
{"label": "Administration Guide", "value": "Administration Guide"}
|
| 219 |
-
],
|
| 220 |
-
inline=True,
|
| 221 |
-
),
|
| 222 |
-
dbc.Input(id="exclude-folders", type="text", placeholder="Exclude Folders (comma-separated)"),
|
| 223 |
-
dbc.Button("Generate Guide", id="generate-button", color="primary", className="mt-3"),
|
| 224 |
-
])
|
| 225 |
-
])
|
| 226 |
-
], className="mb-4"),
|
| 227 |
-
|
| 228 |
-
dbc.Spinner(
|
| 229 |
-
dbc.Card([
|
| 230 |
-
dbc.CardBody([
|
| 231 |
-
html.H4("Generated Guide", className="card-title"),
|
| 232 |
-
html.Div([
|
| 233 |
-
dbc.Button("Download DOCX", id="download-docx", color="secondary", className="me-2"),
|
| 234 |
-
dbc.Button("Download Markdown", id="download-md", color="secondary"),
|
| 235 |
-
], className="mt-3"),
|
| 236 |
-
dcc.Download(id="download-docx-file"),
|
| 237 |
-
dcc.Download(id="download-md-file"),
|
| 238 |
-
])
|
| 239 |
-
], className="mt-4"),
|
| 240 |
-
color="primary",
|
| 241 |
-
),
|
| 242 |
-
], width=6),
|
| 243 |
-
dbc.Col([
|
| 244 |
-
dbc.Card([
|
| 245 |
-
dbc.CardBody([
|
| 246 |
-
html.H4("Preview", className="card-title"),
|
| 247 |
-
html.Div(id="generated-guide", style={"whiteSpace": "pre-wrap", "height": "400px", "overflowY": "auto"}),
|
| 248 |
-
])
|
| 249 |
-
], className="mt-4"),
|
| 250 |
-
], width=6),
|
| 251 |
-
])
|
| 252 |
], fluid=True)
|
| 253 |
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
Output("download-md", "n_clicks")],
|
| 258 |
-
[Input("generate-button", "n_clicks")],
|
| 259 |
-
[State("git-provider", "value"),
|
| 260 |
-
State("repo-url", "value"),
|
| 261 |
-
State("guide-type", "value"),
|
| 262 |
-
State("exclude-folders", "value")]
|
| 263 |
-
)
|
| 264 |
-
def update_output(n_clicks, git_provider, repo_url, guide_type, exclude_folders):
|
| 265 |
-
if n_clicks is None:
|
| 266 |
raise PreventUpdate
|
| 267 |
-
|
| 268 |
-
def generate_guide_thread():
|
| 269 |
-
nonlocal guide_text, docx_path, md_path
|
| 270 |
-
guide_text, docx_path, md_path = generate_guide(git_provider, repo_url, guide_type, exclude_folders)
|
| 271 |
-
|
| 272 |
-
guide_text, docx_path, md_path = None, None, None
|
| 273 |
-
thread = threading.Thread(target=generate_guide_thread)
|
| 274 |
-
thread.start()
|
| 275 |
-
thread.join()
|
| 276 |
-
|
| 277 |
-
return guide_text, 0, 0 # Reset n_clicks for download buttons
|
| 278 |
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
|
| 289 |
@app.callback(
|
| 290 |
-
Output("download-
|
| 291 |
-
Input("download-
|
| 292 |
-
|
|
|
|
| 293 |
)
|
| 294 |
-
def
|
| 295 |
-
if
|
| 296 |
raise PreventUpdate
|
| 297 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
|
| 299 |
if __name__ == '__main__':
|
| 300 |
print("Starting the Dash application...")
|
|
|
|
| 2 |
from dash import dcc, html, Input, Output, State
|
| 3 |
import dash_bootstrap_components as dbc
|
| 4 |
from dash.exceptions import PreventUpdate
|
| 5 |
+
import base64
|
|
|
|
|
|
|
| 6 |
import requests
|
| 7 |
+
from bs4 import BeautifulSoup
|
| 8 |
+
from urllib.parse import urljoin, urlparse
|
| 9 |
+
from fpdf import FPDF
|
| 10 |
+
import re
|
| 11 |
import logging
|
| 12 |
+
import asyncio
|
| 13 |
+
import aiohttp
|
| 14 |
+
from aiolimiter import AsyncLimiter
|
| 15 |
+
import sqlite3
|
| 16 |
+
from contextlib import contextmanager
|
| 17 |
+
from threading import local
|
| 18 |
+
import time
|
| 19 |
+
import os
|
| 20 |
+
import ssl
|
| 21 |
+
from io import BytesIO
|
| 22 |
+
import tempfile
|
| 23 |
+
import uuid
|
| 24 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 25 |
+
from PyPDF2 import PdfMerger
|
| 26 |
|
| 27 |
# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
server = app.server  # exposed so WSGI servers (e.g. gunicorn on Spaces) can find it

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Thread-local storage for database connections (sqlite3 connections are not
# safe to share across threads by default)
thread_local = local()

# Rate limiter: 10 requests per second, shared by all crawl coroutines
rate_limiter = AsyncLimiter(10, 1)

# Create an SSL context that ignores certificate verification
# NOTE(review): disabling hostname/cert checks exposes the crawler to
# man-in-the-middle attacks — confirm this is intentional for target sites.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE

# ThreadPoolExecutor for background tasks (crawl + PDF build run off the
# Dash request thread)
executor = ThreadPoolExecutor(max_workers=4)
|
| 48 |
+
|
| 49 |
+
@contextmanager
def get_db_connection():
    """Yield this thread's cached SQLite connection, creating it on first use.

    The connection is deliberately left open on exit so the same thread can
    reuse it across calls.
    """
    conn = getattr(thread_local, "connection", None)
    if conn is None:
        conn = sqlite3.connect('crawl_cache.db')
        thread_local.connection = conn
    try:
        yield conn
    finally:
        # Keep the connection open for reuse by this thread.
        pass
|
| 57 |
+
|
| 58 |
+
def init_db():
    """Create the crawl-cache schema if it does not already exist."""
    with get_db_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS pages
                     (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
        # NOTE: url is already the PRIMARY KEY, so this extra index is
        # presumably redundant in SQLite — harmless, kept for compatibility.
        cursor.execute('''CREATE INDEX IF NOT EXISTS idx_url ON pages(url)''')
        conn.commit()


init_db()
|
| 67 |
+
|
| 68 |
+
def clean_text(text):
    """Strip non-printable characters, then collapse each run of non-ASCII
    characters to a single space. Returns plain ASCII-safe text for FPDF."""
    printable = ''.join(filter(str.isprintable, text))
    return re.sub(r'[^\x00-\x7F]+', ' ', printable)
|
| 72 |
+
|
| 73 |
+
async def get_page_content(session, url):
    """Fetch `url` and extract cleaned text snippets from its main content.

    Returns a list of strings; on HTTP error or exception the list contains
    a single error message instead.
    """
    try:
        async with rate_limiter:
            async with session.get(url, timeout=30) as response:
                if response.status != 200:
                    logger.error(f"Error fetching {url}: HTTP {response.status}")
                    return [f"Error fetching {url}: HTTP {response.status}"]
                html_text = await response.text()
                soup = BeautifulSoup(html_text, 'html.parser')
                # Prefer the semantic article/main region; fall back to the
                # whole document.
                root = soup.find('article') or soup.find('main') or soup
                snippets = []
                # One pass per tag keeps the original grouping of extracted
                # text (all paragraphs, then all h1s, ...).
                for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
                    for element in root.find_all(tag):
                        cleaned = clean_text(element.get_text(strip=True))
                        if cleaned:
                            snippets.append(cleaned)
                logger.info(f"Found {len(snippets)} content items for {url}")
                return snippets
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return [f"Error processing {url}: {str(e)}"]
|
| 96 |
+
|
| 97 |
+
async def get_links(session, url, base_url):
    """Collect absolute same-site links found on `url`.

    Only links under `base_url` (excluding `url` itself) are returned;
    any failure yields an empty list.
    """
    try:
        async with rate_limiter:
            async with session.get(url, timeout=30) as response:
                if response.status != 200:
                    logger.error(f"Error fetching links from {url}: HTTP {response.status}")
                    return []
                html_text = await response.text()
                soup = BeautifulSoup(html_text, 'html.parser')
                found = []
                for anchor in soup.find_all('a', href=True):
                    # Resolve relative hrefs against the current page.
                    resolved = urljoin(url, anchor['href'])
                    if resolved.startswith(base_url) and resolved != url:
                        found.append(resolved)
                return found
    except Exception as e:
        logger.error(f"Error getting links from {url}: {str(e)}")
        return []
|
| 117 |
+
|
| 118 |
+
async def crawl_pages(base_url, max_depth):
    """Breadth-first crawl of `base_url` up to `max_depth` links deep.

    Page text is cached in SQLite so repeated runs skip refetching. Returns
    a list of (url, content) tuples where content is a list of cleaned text
    snippets. Individual page failures are logged and skipped.
    """
    import ast  # local import: used only to parse cached content safely

    visited = set()
    to_visit = [(base_url, 0)]
    all_pages = []

    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=ssl_context)) as session:
        while to_visit:
            current_url, depth = to_visit.pop(0)
            if current_url in visited or depth > max_depth:
                continue

            visited.add(current_url)
            start_time = time.time()

            try:
                with get_db_connection() as conn:
                    c = conn.cursor()
                    c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
                    result = c.fetchone()

                if result:
                    # Cache hit: the column stores repr(list-of-str).
                    # FIX: literal_eval replaces the original eval(), which
                    # would execute arbitrary code if the cache file were
                    # ever tampered with; literal_eval parses the same
                    # str(list) format but only accepts Python literals.
                    content = ast.literal_eval(result[0])
                else:
                    content = await get_page_content(session, current_url)
                    with get_db_connection() as conn:
                        c = conn.cursor()
                        c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
                        conn.commit()

                all_pages.append((current_url, content))
                logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

                if depth < max_depth:
                    links = await get_links(session, current_url, base_url)
                    for link in links:
                        if link not in visited:
                            to_visit.append((link, depth + 1))
            except Exception as e:
                logger.error(f"Error processing {current_url}: {str(e)}")
                # Continue with the next URL even if this one fails

    return all_pages
|
| 160 |
+
|
| 161 |
+
def generate_pdf_chunk(chunk, output_file):
    """Render one chunk of (url, snippets) pairs into a PDF at `output_file`."""
    doc = FPDF()
    doc.set_auto_page_break(auto=True, margin=15)
    doc.add_page()
    doc.set_font("Arial", size=12)

    for page_url, snippets in chunk:
        # Each page's URL acts as a heading for its extracted text.
        doc.cell(0, 10, txt=page_url, ln=True)
        doc.ln(5)
        for snippet in snippets:
            try:
                # Limit text length to avoid issues with very long runs.
                doc.multi_cell(0, 10, txt=snippet[:200])
            except Exception as e:
                logger.error(f"Error writing text to PDF: {str(e)}")
            # Start a fresh page when close to the bottom margin.
            if doc.get_y() > 250:
                doc.add_page()

    doc.output(output_file)
|
| 179 |
+
|
| 180 |
+
def website_to_pdf(all_pages, progress_callback):
    """Build a single PDF from all crawled pages and return it as bytes.

    Pages are rendered in chunks of 100 to bound per-pass memory, then the
    per-chunk PDFs are merged. `progress_callback` receives status strings.
    """
    logger.info(f"Starting PDF generation for {len(all_pages)} pages")

    chunk_size = 100
    total_chunks = (len(all_pages) + chunk_size - 1) // chunk_size

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_files = []
        for chunk_number, start in enumerate(range(0, len(all_pages), chunk_size), start=1):
            chunk_path = os.path.join(temp_dir, f"chunk_{start}.pdf")
            generate_pdf_chunk(all_pages[start:start + chunk_size], chunk_path)
            temp_files.append(chunk_path)

            progress = min((start + chunk_size) / len(all_pages), 1.0)
            progress_callback(f"Processing pages... {progress:.0%}")
            logger.info(f"Generated PDF chunk {chunk_number}/{total_chunks}")

        logger.info("Merging PDF chunks...")
        output_pdf = os.path.join(temp_dir, "final.pdf")
        merger = PdfMerger()
        for chunk_path in temp_files:
            merger.append(chunk_path)
        merger.write(output_pdf)
        merger.close()

        logger.info("PDF generation complete. Reading final PDF...")
        with open(output_pdf, 'rb') as f:
            return f.read()
|
| 209 |
+
|
| 210 |
+
async def process_url(url, depth, progress_callback):
    """Crawl `url` to `depth` and render the result to PDF bytes.

    Returns the PDF content as bytes on success, or a human-readable error
    string on failure (callers distinguish the two cases by type).
    """
    try:
        all_pages = await crawl_pages(url, depth)
        if not all_pages:
            return "No pages were successfully crawled. Please check the URL and try again."

        logger.info("Crawling complete. Starting PDF generation...")
        # Run the heavy PDF build in a worker thread so the event loop stays
        # responsive.
        # FIX: get_running_loop() is the supported call from inside a
        # coroutine; get_event_loop() is deprecated in this context since
        # Python 3.10 and can warn or fail on newer interpreters.
        loop = asyncio.get_running_loop()
        pdf_content = await loop.run_in_executor(executor, website_to_pdf, all_pages, progress_callback)
        logger.info("PDF generation complete.")
        return pdf_content
    except Exception as e:
        logger.error(f"Error in process_url: {str(e)}")
        return f"An error occurred: {str(e)}"
|
| 225 |
|
| 226 |
# App layout: a single centered card with the URL form, progress spinner,
# and download plumbing. The two dcc.Store components hold the finished PDF
# (base64) and intermediate progress state client-side.
app.layout = dbc.Container([
    dcc.Store(id='pdf-store'),
    dcc.Store(id='progress-store'),
    dbc.Card(
        dbc.CardBody([
            html.H1("Website to PDF Converter", className="text-center mb-4"),
            html.P("Enter docs URL and crawl depth to convert documentation pages into a PDF. Be responsible for sites you have permission to do this", className="text-center mb-4"),
            dbc.Input(id="url-input", type="text", placeholder="Enter website URL (e.g., https://www.gradio.app/docs)", className="mb-3"),
            # Crawl depth 1-10; default 3.
            dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
            dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
            # Disabled until a PDF is ready.
            dbc.Button("Download PDF", id="download-button", color="secondary", className="mb-3 w-100", disabled=True),
            html.Div([
                dbc.Spinner(html.Div(id="progress-message"), color="primary", type="grow", size="lg"),
            ], className="text-center mb-3"),
            # Polls every second while a conversion is running.
            dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
            dcc.Download(id="download-pdf")
        ]),
        className="mt-4"
    )
], fluid=True)
|
| 247 |
|
| 248 |
+
def update_output(n_clicks, n_intervals, progress_data, url, depth):
    """Dash callback driving the convert/progress UI state machine.

    NOTE(review): the @app.callback decorator for this function is not
    visible in this chunk. The 5-tuple returned below presumably maps to
    (submit-button disabled, download-button color, progress-interval
    disabled, pdf-store data, progress message) — confirm against the
    decorator's Output list before changing the return shape.
    """
    ctx = dash.callback_context
    if not ctx.triggered:
        raise PreventUpdate

    # Which component fired this callback (e.g. "submit-button").
    triggered_id = ctx.triggered[0]['prop_id'].split('.')[0]

    if triggered_id == "submit-button":
        if not url:
            return True, "secondary", True, None, "Please enter a URL"

        # Start the background task
        task_id = str(uuid.uuid4())
        executor.submit(background_task, url, depth, task_id)

        return True, "secondary", False, None, "Processing... Please wait."

    elif triggered_id == "progress-interval" or triggered_id == "progress-store":
        if progress_data is None:
            return True, "secondary", False, None, "Processing... Please wait."

        # String payloads are status or error messages.
        if isinstance(progress_data, str):
            if progress_data.startswith("Error"):
                return True, "secondary", True, None, progress_data
            else:
                return True, "secondary", False, None, progress_data

        # Bytes payload means the PDF is finished: base64-encode it so it
        # survives the JSON transport into the dcc.Store.
        if isinstance(progress_data, bytes):
            encoded = base64.b64encode(progress_data).decode()
            return False, "primary", True, encoded, "PDF ready for download!"

    # Fallback: keep the UI in its idle/processing default state.
    return True, "secondary", False, None, ""
|
| 280 |
|
| 281 |
@app.callback(
    Output("download-pdf", "data"),
    Input("download-button", "n_clicks"),
    State("pdf-store", "data"),
    prevent_initial_call=True
)
def download_pdf(n_clicks, pdf_data):
    """Decode the base64 PDF held in pdf-store and stream it to the browser."""
    if pdf_data is None:
        raise PreventUpdate

    # Timestamped filename so repeated downloads do not collide.
    filename = f"website_content_{int(time.time())}.pdf"
    return dcc.send_bytes(base64.b64decode(pdf_data), filename)
|
| 293 |
+
|
| 294 |
+
def background_task(url, depth, task_id):
    """Run the crawl + PDF pipeline synchronously in a worker thread.

    NOTE(review): `task_id` is accepted but never used in this body —
    presumably intended to key per-task progress; confirm before relying
    on it.
    """
    def progress_callback(message):
        # Update progress in the progress-store
        # NOTE(review): assigning to app.layout.children[1].data mutates the
        # server-side layout object only; Dash does not push that change to
        # already-connected clients, so these progress updates likely never
        # reach the UI. Confirm, and consider a server-side cache (e.g.
        # diskcache) keyed by task_id instead.
        app.layout.children[1].data = message

    try:
        logger.info(f"Starting background task for URL: {url}, depth: {depth}")
        # Bridge into the async crawler from this plain thread.
        pdf_content = asyncio.run(process_url(url, depth, progress_callback))
        logger.info("Background task completed successfully")
        # Store the PDF content directly in the progress-store
        app.layout.children[1].data = pdf_content
    except Exception as e:
        logger.error(f"Error in background task: {str(e)}")
        app.layout.children[1].data = f"Error: {str(e)}"
|
| 308 |
|
| 309 |
if __name__ == '__main__':
|
| 310 |
print("Starting the Dash application...")
|