"""Streamlit front-end for a SmartScraperGraph-based web-scraping assistant.

Pipeline: user supplies a URL -> a local text-generation model drives
SmartScraperGraph -> the raw output is parsed/recovered into JSON ->
validated against RESULT_SCHEMA -> rendered in two Streamlit columns.
"""

import json
from typing import Any, Dict, Union

import streamlit as st
import torch
from jsonschema import ValidationError, validate
from scrapegraphai.graphs import SmartScraperGraph
from transformers import pipeline

# JSON schema every scraper result must satisfy before it is displayed.
# All five keys are mandatory; value types are deliberately loose (e.g.
# "array" item types are unconstrained) because the LLM output varies.
RESULT_SCHEMA = {
    "type": "object",
    "properties": {
        "description": {"type": "string"},
        "founders": {"type": "array"},
        "social_media_links": {"type": "array"},
        "academic_programs": {"type": "object"},
        "key_features": {"type": "array"},
    },
    "required": [
        "description",
        "founders",
        "social_media_links",
        "academic_programs",
        "key_features",
    ],
}


def main() -> None:
    """Main application interface.

    Renders the URL input, runs the scraping pipeline on demand, and
    routes the outcome either to the error banner or to the result view.
    """
    st.set_page_config(page_title="Web Scraper", layout="wide")
    st.title("🔍 Smart Web Scraping Assistant")

    with st.container():
        url = st.text_input("Enter website URL:", "https://www.sliit.lk/")

        if st.button("Start Scraping", type="primary"):
            with st.status("Processing...", expanded=True) as status:
                try:
                    result = run_scraping_pipeline(url)
                    # run_scraping_pipeline reports failures as an
                    # {"error": ...} dict rather than raising.
                    if "error" in result:
                        st.error(f"🚨 Error: {result['error']}")
                    else:
                        status.update(label="Processing complete!", state="complete")
                        display_scraping_results(result)
                except Exception as e:
                    # Last-resort boundary for anything the pipeline's own
                    # error handling did not convert to an {"error": ...} dict.
                    st.error(f"Critical failure: {str(e)}")


def display_scraping_results(data: Dict[str, Any]) -> None:
    """Display structured scraping results.

    Left column: human-readable sections. Right column: raw JSON dump.

    Args:
        data: A dict already validated against RESULT_SCHEMA.
    """
    col1, col2 = st.columns([3, 2])

    with col1:
        st.subheader("Structured Data")
        st.markdown(f"**Description:** {data.get('description', 'N/A')}")
        display_section("🏢 Founders", data.get("founders", []))
        display_section("📱 Social Media", data.get("social_media_links", []))
        # academic_programs is an object; flatten it to (key, value) pairs
        # so it renders as a bullet list like the other sections.
        display_section(
            "🎓 Academic Programs",
            list(data.get("academic_programs", {}).items()),
        )
        display_section("⭐ Key Features", data.get("key_features", []))

    with col2:
        st.subheader("Raw JSON Output")
        st.code(json.dumps(data, indent=2), language="json")


def display_section(title: str, items: list) -> None:
    """Render a section with list items.

    Sections with no items are skipped entirely (no empty headers).

    Args:
        title: Section heading (may include an emoji prefix).
        items: Values to render as markdown bullets.
    """
    if items:
        st.markdown(f"**{title}**")
        for item in items:
            # FIX: the original f-string contained a stray newline between
            # "- " and the item, which broke the markdown bullet rendering.
            st.markdown(f"- {str(item)}")


def initialize_text_generation_pipeline():
    """
    Initialize a text generation pipeline with specific configurations.

    This method creates a pipeline that:
    - Uses the DeepSeek model
    - Configured for text generation
    - Optimized for JSON-like outputs (low temperature, mild repetition
      penalty, prompt echo suppressed via return_full_text=False)
    """
    return pipeline(
        "text-generation",
        model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        device_map="auto",          # let accelerate place layers on GPU/CPU
        torch_dtype=torch.float16,  # halve memory; fine for inference
        trust_remote_code=True,
        return_full_text=False,
        max_new_tokens=1024,
        temperature=0.1,            # near-deterministic for structured output
        top_p=0.9,
        repetition_penalty=1.1,
    )


def run_scraping_pipeline(url: str) -> Dict[str, Any]:
    """Execute the complete scraping workflow.

    Args:
        url: Website to scrape.

    Returns:
        The validated result dict on success, or {"error": <message>} on
        any failure — this function never raises to its caller.
    """
    try:
        # Initialize text generation pipeline
        text_pipeline = initialize_text_generation_pipeline()

        # Configure scraping graph. NOTE(review): passing the pipeline
        # object itself as "model" relies on ScrapeGraphAI accepting a
        # pre-built HF pipeline — confirm against the installed version.
        scraper_config = {
            "llm": {
                "model": text_pipeline,
                "model_tokens": 8192,
                "json_mode": True,
                "response_format": {"type": "json_object"},
            },
            "verbose": False,
            "headless": True,
        }

        # Initialize scraper with a prompt that pins the exact JSON shape
        # expected by RESULT_SCHEMA.
        scraper = SmartScraperGraph(
            prompt="""Extract structured information as JSON with:
- Company description (string)
- Founders (array of names)
- Social media links (array of URLs)
- Academic programs (object with program types)
- Key features (array of strings)
Return ONLY valid JSON using this structure. 
Ensure the JSON is well-formed:
{
  "description": "...",
  "founders": [],
  "social_media_links": [],
  "academic_programs": {},
  "key_features": []
}""",
            source=url,
            config=scraper_config,
        )

        # Execute scraping
        raw_result = scraper.run()
        return process_scraping_result(raw_result)

    except ValidationError as ve:
        return {"error": f"Validation error: {str(ve)}"}
    except json.JSONDecodeError as je:
        return {"error": f"JSON parsing error: {str(je)}"}
    except Exception as e:
        return {"error": f"Scraping failed: {str(e)}"}


def process_scraping_result(raw_data: Union[str, Dict]) -> Dict[str, Any]:
    """Process and validate scraping results.

    Args:
        raw_data: Either a dict or a (possibly malformed) JSON string.

    Returns:
        The parsed dict, schema-validated, with founders and
        social_media_links coerced to lists of strings.

    Raises:
        ValidationError: If the data does not match RESULT_SCHEMA.
        json.JSONDecodeError: If a string input cannot be recovered.
    """
    # Convert to dict if needed
    if isinstance(raw_data, str):
        try:
            parsed_data = json.loads(raw_data)
        except json.JSONDecodeError:
            parsed_data = attempt_json_recovery(raw_data)
    else:
        parsed_data = raw_data

    # Validate against schema (raises ValidationError on mismatch, which
    # run_scraping_pipeline converts to an {"error": ...} dict).
    validate(instance=parsed_data, schema=RESULT_SCHEMA)

    # Type conversions: the schema only guarantees arrays, not item types.
    parsed_data["founders"] = [str(name) for name in parsed_data["founders"]]
    parsed_data["social_media_links"] = [
        str(link) for link in parsed_data["social_media_links"]
    ]

    return parsed_data


def attempt_json_recovery(raw_str: str, max_attempts: int = 3) -> Dict:
    """Attempt to fix malformed JSON strings.

    Tries progressively more aggressive clean-ups (the original retried
    the *same* deterministic cleaning max_attempts times, which could
    never succeed after the first failure):
      1. Strip whitespace and markdown code fences.
      2. Extract the outermost {...} span, dropping surrounding prose.

    Args:
        raw_str: The string that failed a plain json.loads.
        max_attempts: Upper bound on recovery strategies tried.

    Raises:
        json.JSONDecodeError: If no strategy yields valid JSON.
    """
    cleaned = raw_str.strip().replace("```json", "").replace("```", "")
    candidates = [cleaned]

    # Fallback: keep only the outermost JSON object, if one is present.
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if start != -1 and end > start:
        candidates.append(cleaned[start : end + 1])

    for candidate in candidates[:max_attempts]:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    # FIX: JSONDecodeError requires (msg, doc, pos); the original one-arg
    # call raised TypeError instead of the intended exception, so the
    # `except json.JSONDecodeError` in run_scraping_pipeline never fired.
    raise json.JSONDecodeError(
        "Failed to recover JSON after multiple attempts", raw_str, 0
    )


if __name__ == "__main__":
    main()