Tngarg commited on
Commit
db16232
·
verified ·
1 Parent(s): 2ee7f94

Upload 12 files

Browse files
app.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
# app.py
# Thin entry point: all UI construction lives in ui.py.
from ui import launch_ui

if __name__ == "__main__":
    # Launch only when run as a script, never on import.
    launch_ui()
chat_wrapper.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# chat_model_wrapper.py

import os
import google.generativeai as genai
from dotenv import load_dotenv

load_dotenv(override=True)


class ChatRefiner:
    """Thin wrapper around a Gemini model for prompt -> text generation."""

    def __init__(self, model_name: str = "gemini-1.5-flash"):
        """
        Configure the Gemini client and create the model handle.

        Args:
            model_name: Gemini model identifier to use.

        Raises:
            ValueError: if GEMINI_API_KEY is not set in the environment
                (or the .env file loaded above).
        """
        # SECURITY: the key must come from the environment only. The previous
        # revision hard-coded an API key here, which both leaked the secret in
        # version control and made this env lookup dead code.
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not set in environment or .env file.")

        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)

    def answer(self, prompt: str) -> str:
        """
        Send `prompt` to Gemini and return the generated text, stripped.

        Raises:
            Exception: any API/client error is logged and re-raised so the
                caller can decide how to degrade.
        """
        try:
            response = self.model.generate_content(prompt)
            return response.text.strip()
        except Exception as e:
            # Log or handle as you prefer
            print(f"[ChatRefiner.answer] Error: {e}")
            raise
pdf_exporter.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import pdfkit
from dotenv import load_dotenv

load_dotenv(override=True)


class PDFExporter:
    """Renders HTML strings to PDF files using wkhtmltopdf via pdfkit."""

    def __init__(self):
        # Destination directory for generated PDFs; created on demand.
        self.output_dir = os.getenv('OUTPUT_DIR', './out')
        os.makedirs(self.output_dir, exist_ok=True)

        # Optional explicit wkhtmltopdf binary for systems where it is
        # not on PATH; fail fast if the configured path is wrong.
        wkhtml_path = os.getenv("WKHTMLTOPDF_PATH")
        if wkhtml_path and not os.path.isfile(wkhtml_path):
            raise FileNotFoundError(f"wkhtmltopdf not found at: {wkhtml_path}")

        self.config = pdfkit.configuration(wkhtmltopdf=wkhtml_path) if wkhtml_path else None

    def export(self, html: str, name: str = "report") -> str:
        """Write `html` as `<output_dir>/<name>.pdf` and return that path.

        Any pdfkit/wkhtmltopdf failure is logged and re-raised.
        """
        out_path = os.path.join(self.output_dir, f"{name}.pdf")
        try:
            pdfkit.from_string(html, out_path, configuration=self.config)
        except Exception as e:
            print(f"[PDFExporter.export] PDF generation failed: {e}")
            raise
        return out_path
performance_agent.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# performance_agent.py

import os
import requests


class PerformanceAgent:
    """
    Wraps the PageSpeed Insights API to fetch
    Lighthouse scores for a given URL.
    On any network/API failure, returns all-100% defaults.
    """

    EXPECTED_CATEGORIES = ("performance", "accessibility", "best-practices", "seo")

    def __init__(self):
        """
        Raises:
            ValueError: if PAGESPEED_API_KEY is not set in the environment.
        """
        # SECURITY: read the key from the environment only. The previous
        # revision committed a live API key as the getenv() default, which
        # leaked the secret and made the guard below unreachable.
        self.api_key = os.getenv("PAGESPEED_API_KEY")
        self.endpoint = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
        if not self.api_key:
            raise ValueError("PAGESPEED_API_KEY not set in .env")

    def fetch_performance(self, url: str, strategy: str = "mobile") -> dict:
        """
        Fetch Lighthouse category scores and failing-audit hints for `url`.

        Args:
            url: page to analyze.
            strategy: "mobile" or "desktop".

        Returns:
            {"scores": {category: percent}, "audit_suggestions": {id: hint}}.
            On any network/API error, all scores default to 100.0 and the
            suggestions dict is empty (best-effort, never raises for
            request failures).
        """
        params = {
            "url": url,
            "strategy": strategy,  # "mobile" or "desktop"
            "key": self.api_key
        }

        try:
            resp = requests.get(self.endpoint, params=params, timeout=30)
            resp.raise_for_status()
            data = resp.json()
            lh = data.get("lighthouseResult", {})
            cats = lh.get("categories", {})
            audits = lh.get("audits", {})

            # Build scores, default missing/unparsable → 100.0
            scores = {}
            for cat in self.EXPECTED_CATEGORIES:
                raw = cats.get(cat, {}).get("score")
                if isinstance(raw, (int, float)):
                    # PSI reports scores in [0, 1]; expose percentages.
                    scores[cat] = round(raw * 100, 1)
                else:
                    scores[cat] = 100.0

            # Collect audits < 100% for suggestions
            audit_suggestions = {}
            for audit_id, info in audits.items():
                sc = info.get("score")
                if isinstance(sc, (int, float)) and sc < 1:
                    audit_suggestions[audit_id] = info.get("displayValue", "").strip()

            return {
                "scores": scores,
                "audit_suggestions": audit_suggestions
            }

        except requests.exceptions.RequestException as e:
            # Log the error and return an all-100% default, with no suggestions
            print(f"[PerformanceAgent] PSI API error: {e}")
            return {
                "scores": {cat: 100.0 for cat in self.EXPECTED_CATEGORIES},
                "audit_suggestions": {}
            }
pipeline.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import asyncio
from scraper_agent import ScraperAgent
from seo_agent.seo_agent import SEOAgent
from performance_agent import PerformanceAgent
from report_generator import ReportGenerator
# from pdf_exporter import PDFExporter


class Pipeline:
    """Orchestrates scraping, performance, SEO/image analysis and report build."""

    def __init__(self):
        self.scraper = ScraperAgent()
        self.seo = SEOAgent()
        self.perf = PerformanceAgent()
        self.report = ReportGenerator()
        # self.pdf = PDFExporter()

    async def run(self, url: str):
        """Async generator yielding (status_message, payload) pairs for the UI."""
        yield "🔍 Scraping URL and Fetching Performance", None

        # Page scrape and PSI fetch are independent — overlap them in threads.
        page, perf_data = await asyncio.gather(
            asyncio.to_thread(self.scraper.fetch, url),
            asyncio.to_thread(self.perf.fetch_performance, url),
        )

        yield f"✔️ Title: {page['title']!r}", None
        yield "✅ Performance metrics ready", perf_data

        # SEO and image analysis both need the scraped page; run them together.
        yield "⚙️ Running SEO & Image analysis…", None
        seo_report, img_report = await asyncio.gather(
            asyncio.to_thread(self.seo.analyze_seo, page["html"], page["text"]),
            asyncio.to_thread(self.seo.analyze_images, page["images"], page["text"]),
        )

        yield "✅ SEO analysis complete", seo_report
        yield "✅ Image analysis complete", img_report

        # Report rendering is sequential: it consumes every prior result.
        yield "📝 Generating report HTML…", None
        html = self.report.build(url, str(perf_data), seo_report, img_report)
        yield "✅ Report HTML ready", html

        # PDF export is intentionally disabled (the HTML preview offers print).
        # yield "📄 Exporting to PDF…", None
        # pdf_path = self.pdf.export(html, name="final_report")
        # yield "🎉 Done!", pdf_path

        self.scraper.close()
report_generator.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
from datetime import datetime
from jinja2 import Environment, FileSystemLoader, select_autoescape
import markdown
from chat_wrapper import ChatRefiner


class ReportGenerator:
    """
    Builds the final HTML report: refines raw analysis text via Gemini,
    converts the markdown results to HTML, and renders a Jinja2 template.
    """

    def __init__(self, template_path: str = None, prompt_path: str = None):
        """
        Args:
            template_path: Jinja2 report template (default: TEMPLATE_PATH env
                or ./templates/report.html.j2).
            prompt_path: section-refinement prompt file (default: PROMPT_PATH
                env or ./report_prompt.txt).

        Raises:
            FileNotFoundError: if the template or any prompt file is missing.
        """
        # Default template path
        self.template_path = template_path or os.getenv("TEMPLATE_PATH", "./templates/report.html.j2")
        if not os.path.isfile(self.template_path):
            raise FileNotFoundError(f"Report template not found: {self.template_path}")

        # Load Jinja2 environment rooted at the template's directory.
        tpl_dir = os.path.dirname(self.template_path)
        tpl_file = os.path.basename(self.template_path)
        self.env = Environment(
            loader=FileSystemLoader(tpl_dir),
            autoescape=select_autoescape(['html', 'xml'])
        )
        self.template = self.env.get_template(tpl_file)

        # Load refinement prompt
        prompt_path = prompt_path or os.getenv("PROMPT_PATH", "./report_prompt.txt")
        if not os.path.isfile(prompt_path):
            raise FileNotFoundError(f"Refinement prompt template not found: {prompt_path}")
        with open(prompt_path, encoding="utf-8") as f:
            self.refine_prompt_template = f.read()

        # Load additional prompts for summary and UX/UI
        self.overall_summary_prompt = self._load_prompt("./summary_prompt.txt")
        self.ux_summary_prompt = self._load_prompt("./ui_ux_prompt.txt")

        # Initialize Gemini wrapper
        self.chat = ChatRefiner(model_name="gemini-1.5-flash")

    def _load_prompt(self, path: str) -> str:
        """Read a prompt file, raising FileNotFoundError if absent."""
        if not os.path.isfile(path):
            raise FileNotFoundError(f"Prompt file not found: {path}")
        with open(path, encoding="utf-8") as f:
            return f.read()

    def refine_section(self, raw_text: str) -> str:
        """
        Uses Gemini to convert raw dict-like or prose analysis into
        markdown-style summary. Falls back to the raw text on any error.
        """
        prompt = self.refine_prompt_template.replace("${raw}", raw_text)
        try:
            return self.chat.answer(prompt)
        except Exception as e:
            print(f"[ReportGenerator.refine_section] Error during refinement: {e}")
            return raw_text  # fallback to raw

    def generate_summaries(self, full_html: str, image_html: str) -> tuple[str, str]:
        """
        Uses Gemini to generate:
        - A human-readable overall summary
        - A UI/UX & images analysis summary

        Returns ("", "") on any failure so report rendering can continue.
        """
        try:
            overall_prompt = self.overall_summary_prompt.replace("${html}", full_html)
            ux_prompt = self.ux_summary_prompt.replace("${html}", image_html)
            # NOTE: previous revision print()ed both full HTML payloads here
            # (debug leftovers); removed to keep stdout usable for real logs.

            overall_md = self.chat.answer(overall_prompt)
            ux_md = self.chat.answer(ux_prompt)

            overall_html = markdown.markdown(overall_md)
            ux_html = markdown.markdown(ux_md)
            return overall_html, ux_html
        except Exception as e:
            print(f"[generate_summaries] Error: {e}")
            return "", ""

    def build(self, url: str, perf_text: str, seo_text: str, img_text: str) -> str:
        """
        Main method to convert raw analysis text into a clean HTML report.
        Each section is refined via Gemini and rendered via markdown → Jinja2.
        """
        # Refine via Gemini
        refined_perf = self.refine_section(perf_text)
        refined_seo = self.refine_section(seo_text)
        refined_img = self.refine_section(img_text)

        # Convert to HTML using markdown
        perf_html = markdown.markdown(refined_perf, extensions=["tables", "fenced_code"])
        seo_html = markdown.markdown(refined_seo, extensions=["tables", "fenced_code"])
        img_html = markdown.markdown(refined_img, extensions=["tables", "fenced_code"])

        # Generate summaries using refined HTML
        full_html = perf_html + "\n\n" + seo_html + "\n\n" + img_html
        overall_summary_html, ux_ui_html = self.generate_summaries(full_html, img_html)

        # Inject into template
        return self.template.render(
            url=url,
            timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            summary=overall_summary_html,
            ux_summary=ux_ui_html,
            performance=perf_html,
            seo=seo_html,
            images=img_html
        )
report_prompt.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a frontend engineer and technical writer.
2
+
3
+ Your task is to convert the following raw diagnostic report into structured, clean text using markdown style. Specifically:
4
+
5
+ - If the text contains JSON or dicts, extract the key values and convert them into readable bullet points.
6
+ - Group data logically under subheadings (use `###`).
7
+ - Bold key numbers or metrics using `**bold**`.
8
+ - If the text is prose, clean it up for readability.
9
+ - Do not include raw JSON blocks or code fences.
10
+
11
+ === BEGIN RAW REPORT ===
12
+ ${raw}
13
+ === END RAW REPORT ===
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ python-dotenv
3
+ google-cloud-aiplatform
4
+ google-generativeai
5
+ gradio
6
+ jinja2
7
+ pdfkit
8
+ requests
9
+
10
+
11
+ # scraping enhancements:
12
+ beautifulsoup4
13
+ pandas
14
+ langchain-community
15
+
16
+ markdown
scraper_agent.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# scraper_agent.py

import os
import time
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup

load_dotenv(override=True)


class ScraperAgent:
    """Fetches a page over HTTP and extracts title, image URLs and body text."""

    def __init__(self):
        # A browser-like User-Agent avoids trivial bot blocking; overridable.
        self.user_agent = os.getenv(
            "SCRAPER_USER_AGENT",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
        )
        self.timeout = int(os.getenv("SCRAPER_TIMEOUT", "10"))
        # Politeness delay applied after each fetch (seconds).
        self.delay = float(os.getenv("SCRAPER_DELAY", "0.5"))

    def fetch(self, url: str) -> dict:
        """
        Download `url` and return {"title": str, "html": str,
        "images": list[str], "text": str}.

        Raises:
            requests.HTTPError: on non-2xx responses (raise_for_status).
        """
        headers = {"User-Agent": self.user_agent}
        resp = requests.get(url, headers=headers, timeout=self.timeout)
        resp.raise_for_status()

        html = resp.text
        soup = BeautifulSoup(html, "html.parser")

        images = [img["src"] for img in soup.find_all("img", src=True)]
        body = soup.body.get_text("\n", strip=True) if soup.body else ""

        # FIX: soup.title.string is None for an empty <title>; coerce to ""
        # so the "title" field is always a str as documented.
        title = (soup.title.string or "") if soup.title else ""

        time.sleep(self.delay)

        return {
            "title": title,
            "html": html,
            "images": images,
            "text": body
        }

    def close(self):
        """
        Clean up any resources.
        No-op for requests-based scraper,
        but lets pipeline always call scraper.close().
        """
        pass
summary_prompt.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a friendly website audit assistant writing for a non-technical business audience.
2
+
3
+ Given the following website report sections (on performance, SEO, and images), write a short and clear **Overall Summary** that highlights:
4
+
5
+ - What’s working well
6
+ - What needs the most improvement
7
+ - Any major concerns to pay attention to
8
+
9
+ Avoid technical terms like "LCP" or "CLS" — instead, say things like “the site loads quickly” or “some images are too large.”
10
+
11
+ Keep it simple, helpful, and friendly. Imagine you’re explaining this to a founder or project manager with no technical background. The output should be in a descriptive report style.
12
+ ---
13
+
14
+ ### Website Summary HTML:
15
+
16
+ ${html}
ui.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
from pipeline import Pipeline
import asyncio

# Single shared pipeline instance reused across requests.
pipeline = Pipeline()

# Soft pastel theme: blue/sky hues on slate neutrals with Poppins.
theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.blue,
    secondary_hue=gr.themes.colors.sky,
    neutral_hue=gr.themes.colors.slate,
    font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif", "system-ui", "sans-serif"],
).set(
    # Page background: very light pastel blue (dark mode near-black).
    body_background_fill="#f0f9ff",
    body_background_fill_dark="#020617",

    # Cards/blocks: white, bordered, large shadow + radius.
    block_background_fill="white",
    block_border_width="1px",
    block_shadow="*shadow_drop_lg",
    block_radius="*radius_xl",

    # Primary buttons: solid blue with darker hover, white label.
    button_primary_background_fill="*primary_500",
    button_primary_background_fill_hover="*primary_600",
    button_primary_text_color="white",
)

# Extra CSS for layout tweaks the theme API does not cover.
custom_css = """
/* Make the container wider for a better desktop experience */
.gradio-container {
    max-width: 90% !important;
}

/* Vertically align the button with the textbox */
.input-row {
    align-items: center;
}

/* Add spacing between status box and report preview */
.gr-block > .gr-row > *:not(:last-child) {
    margin-right: 2rem;
}
"""
+
48
+
49
+ async def analyze(url: str):
50
+ if not url:
51
+ yield gr.update(value="❌ Please enter a URL."), gr.update(value="")
52
+ return
53
+
54
+ html_output = ""
55
+
56
+ async for status, payload in pipeline.run(url):
57
+ if payload and isinstance(payload, str) and payload.strip().startswith("<!DOCTYPE html>"):
58
+ html_output = payload
59
+ yield gr.update(value=status), gr.update(value=html_output)
60
+
61
+
62
+ def launch_ui():
63
+ """
64
+ Launches the Gradio interface with the new pastel theme and improved layout.
65
+ """
66
+ with gr.Blocks(theme=theme, css=custom_css, title="Website Analyzer") as demo:
67
+ # Main Title and Subtitle
68
+ gr.Markdown(
69
+ """
70
+ <div style="text-align: center; padding: 2rem 0;">
71
+ <h1 style="font-size: 2.8rem; font-weight: 700; color: #1e3a8a;">📊 Website Intelligence Report</h1>
72
+ <p style="color: #475569; font-size: 1.1rem;">Enter any URL to generate a comprehensive analysis of its performance, SEO, and layout.</p>
73
+ </div>
74
+ """
75
+ )
76
+
77
+ # Input Row: URL Textbox and Analyze Button
78
+ with gr.Row(elem_classes="input-row"):
79
+ url_in = gr.Textbox(
80
+ label="Webpage URL",
81
+ placeholder="e.g., https://www.example.com",
82
+ scale=4, # Give more width to the textbox
83
+ container=False # Remove the container for better alignment
84
+ )
85
+ analyze_btn = gr.Button(
86
+ "Analyze 🚀",
87
+ variant="primary",
88
+ scale=1, # Give less width to the button
89
+ )
90
+
91
+ # Output Sections
92
+ status_out = gr.Textbox(
93
+ label="Analysis Progress",
94
+ interactive=False,
95
+ lines=1,
96
+ scale=1,
97
+ placeholder="Status updates will appear here...",
98
+ )
99
+ html_preview = gr.HTML(
100
+ label="Final Report Preview"
101
+ # scale=3
102
+ )
103
+
104
+ gr.Markdown(
105
+ "--- \n"
106
+ "<p style='text-align:center; color: #6b7280;'>💡 After analysis, use the <strong>Print Report</strong> button inside the preview to save as a PDF.</p>"
107
+ )
108
+
109
+ # Button Click Action
110
+ analyze_btn.click(
111
+ fn=analyze,
112
+ inputs=[url_in],
113
+ outputs=[status_out, html_preview],
114
+ show_progress="full",
115
+ concurrency_limit=1,
116
+ )
117
+
118
+ demo.launch(share=True)
119
+ if __name__ == "__main__":
120
+ launch_ui()
ui_ux_prompt.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a website experience reviewer writing for a non-technical client.
2
+
3
+ Using the image and layout section of this report, write a short and friendly **UX & Design Summary**.
4
+
5
+ Please include:
6
+ - Any problems with how the page looks or feels (like clutter, spacing, readability)
7
+ - Notes on how images are used (e.g., missing descriptions, too slow to load, poor quality)
8
+ - Suggestions for making the experience smoother on desktop or mobile
9
+
10
+ Avoid tech jargon. Use plain language like “some images don’t show descriptions” or “the layout could feel cleaner.”
11
+
12
+ Imagine you’re writing for someone who owns the website and just wants to know what to fix for better user experience.
13
+
14
+ ---
15
+
16
+ ### Image & Layout HTML Section:
17
+
18
+ ${html}