"""Streamlit front-end for a SmartScraperGraph-based web-scraping assistant.

Pipeline: user supplies a URL -> a local text-generation model drives
SmartScraperGraph -> the raw output is parsed/recovered into JSON ->
validated against RESULT_SCHEMA -> rendered in two Streamlit columns.
"""

import json
from typing import Any, Dict, Union

import streamlit as st
import torch
from jsonschema import ValidationError, validate
from scrapegraphai.graphs import SmartScraperGraph
from transformers import pipeline

# JSON schema every scraper result must satisfy before it is displayed.
# All five keys are mandatory; value types are deliberately loose (e.g.
# "array" item types are unconstrained) because the LLM output varies.
RESULT_SCHEMA = {
    "type": "object",
    "properties": {
        "description": {"type": "string"},
        "founders": {"type": "array"},
        "social_media_links": {"type": "array"},
        "academic_programs": {"type": "object"},
        "key_features": {"type": "array"},
    },
    "required": [
        "description",
        "founders",
        "social_media_links",
        "academic_programs",
        "key_features",
    ],
}


def main() -> None:
    """Main application interface.

    Renders the URL input, runs the scraping pipeline on demand, and
    routes the outcome either to the error banner or to the result view.
    """
    st.set_page_config(page_title="Web Scraper", layout="wide")
    st.title("🔍 Smart Web Scraping Assistant")

    with st.container():
        url = st.text_input("Enter website URL:", "https://www.sliit.lk/")

        if st.button("Start Scraping", type="primary"):
            with st.status("Processing...", expanded=True) as status:
                try:
                    result = run_scraping_pipeline(url)
                    # run_scraping_pipeline reports failures as an
                    # {"error": ...} dict rather than raising.
                    if "error" in result:
                        st.error(f"🚨 Error: {result['error']}")
                    else:
                        status.update(label="Processing complete!", state="complete")
                        display_scraping_results(result)
                except Exception as e:
                    # Last-resort boundary for anything the pipeline's own
                    # error handling did not convert to an {"error": ...} dict.
                    st.error(f"Critical failure: {str(e)}")


def display_scraping_results(data: Dict[str, Any]) -> None:
    """Display structured scraping results.

    Left column: human-readable sections. Right column: raw JSON dump.

    Args:
        data: A dict already validated against RESULT_SCHEMA.
    """
    col1, col2 = st.columns([3, 2])

    with col1:
        st.subheader("Structured Data")
        st.markdown(f"**Description:** {data.get('description', 'N/A')}")
        display_section("🏢 Founders", data.get("founders", []))
        display_section("📱 Social Media", data.get("social_media_links", []))
        # academic_programs is an object; flatten it to (key, value) pairs
        # so it renders as a bullet list like the other sections.
        display_section(
            "🎓 Academic Programs",
            list(data.get("academic_programs", {}).items()),
        )
        display_section("⭐ Key Features", data.get("key_features", []))

    with col2:
        st.subheader("Raw JSON Output")
        st.code(json.dumps(data, indent=2), language="json")


def display_section(title: str, items: list) -> None:
    """Render a section with list items.

    Sections with no items are skipped entirely (no empty headers).

    Args:
        title: Section heading (may include an emoji prefix).
        items: Values to render as markdown bullets.
    """
    if items:
        st.markdown(f"**{title}**")
        for item in items:
            # FIX: the original f-string contained a stray newline between
            # "- " and the item, which broke the markdown bullet rendering.
            st.markdown(f"- {str(item)}")


def initialize_text_generation_pipeline():
    """
    Initialize a text generation pipeline with specific configurations.

    This method creates a pipeline that:
    - Uses the DeepSeek model
    - Configured for text generation
    - Optimized for JSON-like outputs (low temperature, mild repetition
      penalty, prompt echo suppressed via return_full_text=False)
    """
    return pipeline(
        "text-generation",
        model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        device_map="auto",          # let accelerate place layers on GPU/CPU
        torch_dtype=torch.float16,  # halve memory; fine for inference
        trust_remote_code=True,
        return_full_text=False,
        max_new_tokens=1024,
        temperature=0.1,            # near-deterministic for structured output
        top_p=0.9,
        repetition_penalty=1.1,
    )


def run_scraping_pipeline(url: str) -> Dict[str, Any]:
    """Execute the complete scraping workflow.

    Args:
        url: Website to scrape.

    Returns:
        The validated result dict on success, or {"error": <message>} on
        any failure — this function never raises to its caller.
    """
    try:
        # Initialize text generation pipeline
        text_pipeline = initialize_text_generation_pipeline()

        # Configure scraping graph. NOTE(review): passing the pipeline
        # object itself as "model" relies on ScrapeGraphAI accepting a
        # pre-built HF pipeline — confirm against the installed version.
        scraper_config = {
            "llm": {
                "model": text_pipeline,
                "model_tokens": 8192,
                "json_mode": True,
                "response_format": {"type": "json_object"},
            },
            "verbose": False,
            "headless": True,
        }

        # Initialize scraper with a prompt that pins the exact JSON shape
        # expected by RESULT_SCHEMA.
        scraper = SmartScraperGraph(
            prompt="""Extract structured information as JSON with:
- Company description (string)
- Founders (array of names)
- Social media links (array of URLs)
- Academic programs (object with program types)
- Key features (array of strings)
Return ONLY valid JSON using this structure. 
Ensure the JSON is well-formed:
{
  "description": "...",
  "founders": [],
  "social_media_links": [],
  "academic_programs": {},
  "key_features": []
}""",
            source=url,
            config=scraper_config,
        )

        # Execute scraping
        raw_result = scraper.run()
        return process_scraping_result(raw_result)

    except ValidationError as ve:
        return {"error": f"Validation error: {str(ve)}"}
    except json.JSONDecodeError as je:
        return {"error": f"JSON parsing error: {str(je)}"}
    except Exception as e:
        return {"error": f"Scraping failed: {str(e)}"}


def process_scraping_result(raw_data: Union[str, Dict]) -> Dict[str, Any]:
    """Process and validate scraping results.

    Args:
        raw_data: Either a dict or a (possibly malformed) JSON string.

    Returns:
        The parsed dict, schema-validated, with founders and
        social_media_links coerced to lists of strings.

    Raises:
        ValidationError: If the data does not match RESULT_SCHEMA.
        json.JSONDecodeError: If a string input cannot be recovered.
    """
    # Convert to dict if needed
    if isinstance(raw_data, str):
        try:
            parsed_data = json.loads(raw_data)
        except json.JSONDecodeError:
            parsed_data = attempt_json_recovery(raw_data)
    else:
        parsed_data = raw_data

    # Validate against schema (raises ValidationError on mismatch, which
    # run_scraping_pipeline converts to an {"error": ...} dict).
    validate(instance=parsed_data, schema=RESULT_SCHEMA)

    # Type conversions: the schema only guarantees arrays, not item types.
    parsed_data["founders"] = [str(name) for name in parsed_data["founders"]]
    parsed_data["social_media_links"] = [
        str(link) for link in parsed_data["social_media_links"]
    ]

    return parsed_data


def attempt_json_recovery(raw_str: str, max_attempts: int = 3) -> Dict:
    """Attempt to fix malformed JSON strings.

    Tries progressively more aggressive clean-ups (the original retried
    the *same* deterministic cleaning max_attempts times, which could
    never succeed after the first failure):
      1. Strip whitespace and markdown code fences.
      2. Extract the outermost {...} span, dropping surrounding prose.

    Args:
        raw_str: The string that failed a plain json.loads.
        max_attempts: Upper bound on recovery strategies tried.

    Raises:
        json.JSONDecodeError: If no strategy yields valid JSON.
    """
    cleaned = raw_str.strip().replace("```json", "").replace("```", "")
    candidates = [cleaned]

    # Fallback: keep only the outermost JSON object, if one is present.
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if start != -1 and end > start:
        candidates.append(cleaned[start : end + 1])

    for candidate in candidates[:max_attempts]:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    # FIX: JSONDecodeError requires (msg, doc, pos); the original one-arg
    # call raised TypeError instead of the intended exception, so the
    # `except json.JSONDecodeError` in run_scraping_pipeline never fired.
    raise json.JSONDecodeError(
        "Failed to recover JSON after multiple attempts", raw_str, 0
    )


if __name__ == "__main__":
    main()