"""Smart Web Scraping Assistant.

Streamlit app that scrapes a website with ScrapeGraphAI driven by a local
DeepSeek text-generation model, validates the extracted JSON against a
schema, and displays the structured result.
"""
| import streamlit as st | |
| from scrapegraphai.graphs import SmartScraperGraph | |
| from transformers import pipeline | |
| import torch | |
| import json | |
| from typing import Dict, Any, Union | |
| from jsonschema import validate, ValidationError | |
# JSON schema for result validation
# All five keys are mandatory in the scraper's output; jsonschema.validate()
# raises ValidationError when any is missing or has the wrong container type.
# Element/value types inside the containers are NOT constrained here —
# process_scraping_result() coerces the list entries to str afterwards.
RESULT_SCHEMA = {
    "type": "object",
    "properties": {
        "description": {"type": "string"},        # free-text site/company summary
        "founders": {"type": "array"},            # founder names (stringified downstream)
        "social_media_links": {"type": "array"},  # URLs (stringified downstream)
        "academic_programs": {"type": "object"},  # program type -> details mapping
        "key_features": {"type": "array"}         # notable feature strings
    },
    "required": ["description", "founders", "social_media_links", "academic_programs", "key_features"]
}
def main() -> None:
    """Streamlit entry point: collect a URL, run the scraping pipeline,
    and render either the structured results or the error message.
    """
    st.set_page_config(page_title="Web Scraper", layout="wide")
    # NOTE(review): "π"/"π¨" below look like mojibake'd emoji from a bad
    # encoding round-trip — confirm the intended glyphs before changing them.
    st.title("π Smart Web Scraping Assistant")
    with st.container():
        url = st.text_input("Enter website URL:", "https://www.sliit.lk/")
        if st.button("Start Scraping", type="primary"):
            with st.status("Processing...", expanded=True) as status:
                try:
                    result = run_scraping_pipeline(url)
                    if "error" in result:
                        # Bug fix: also flip the status widget to its error
                        # state instead of leaving it stuck on "Processing...".
                        status.update(label="Scraping failed", state="error")
                        st.error(f"π¨ Error: {result['error']}")
                    else:
                        status.update(label="Processing complete!", state="complete")
                        display_scraping_results(result)
                except Exception as e:
                    # Same fix on the unexpected-exception path.
                    status.update(label="Scraping failed", state="error")
                    st.error(f"Critical failure: {str(e)}")
def display_scraping_results(data: Dict[str, Any]) -> None:
    """Render a validated scrape result in two columns: structured
    fields on the left, the raw JSON payload on the right.
    """
    structured_col, raw_col = st.columns([3, 2])

    with structured_col:
        st.subheader("Structured Data")
        st.markdown(f"**Description:** {data.get('description', 'N/A')}")
        # Title/items pairs, rendered in a fixed order.
        sections = [
            ("π’ Founders", data.get("founders", [])),
            ("π± Social Media", data.get("social_media_links", [])),
            ("π Academic Programs", list(data.get("academic_programs", {}).items())),
            ("β Key Features", data.get("key_features", [])),
        ]
        for section_title, section_items in sections:
            display_section(section_title, section_items)

    with raw_col:
        st.subheader("Raw JSON Output")
        st.code(json.dumps(data, indent=2), language="json")
def display_section(title: str, items: list) -> None:
    """Emit a bold section heading followed by one bullet per item.

    Sections with no items are skipped entirely (no empty heading).
    """
    if not items:
        return
    st.markdown(f"**{title}**")
    for entry in items:
        st.markdown(f"- {str(entry)}")
def initialize_text_generation_pipeline():
    """
    Build the text-generation pipeline used as the scraper's LLM backend.

    Loads the DeepSeek R1 distilled Qwen 1.5B model with half-precision
    weights, automatic device placement, and low-temperature sampling
    settings chosen to keep the output close to well-formed JSON.
    """
    generation_settings = {
        "device_map": "auto",            # let accelerate pick CPU/GPU placement
        "torch_dtype": torch.float16,    # halve memory footprint vs fp32
        "trust_remote_code": True,       # model repo ships custom code
        "return_full_text": False,       # return only the completion, not the prompt
        "max_new_tokens": 1024,
        "temperature": 0.1,              # near-deterministic decoding
        "top_p": 0.9,
        "repetition_penalty": 1.1,
    }
    return pipeline(
        "text-generation",
        model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        **generation_settings,
    )
def run_scraping_pipeline(url: str) -> Dict[str, Any]:
    """Execute the complete scraping workflow

    Builds the local HF text-generation pipeline, wires it into a
    SmartScraperGraph with a JSON-only extraction prompt, runs the
    scrape, and returns the validated/normalized result dict.

    Args:
        url: website to scrape.

    Returns:
        The parsed result dict on success, or ``{"error": "<message>"}``
        on any failure — callers (see main) check for the "error" key
        rather than catching exceptions.
    """
    try:
        # Initialize text generation pipeline
        text_pipeline = initialize_text_generation_pipeline()
        # Configure scraping graph
        # NOTE(review): passing a transformers pipeline object directly as
        # "model" assumes this scrapegraphai version accepts pre-built
        # pipelines — verify against the installed version's llm config.
        scraper_config = {
            "llm": {
                "model": text_pipeline,
                "model_tokens": 8192,  # context budget advertised to the graph
                "json_mode": True,
                "response_format": {"type": "json_object"}
            },
            "verbose": False,
            "headless": True  # no visible browser window during fetch
        }
        # Initialize scraper
        scraper = SmartScraperGraph(
            prompt="""Extract structured information as JSON with:
            - Company description (string)
            - Founders (array of names)
            - Social media links (array of URLs)
            - Academic programs (object with program types)
            - Key features (array of strings)
            Return ONLY valid JSON using this structure. Ensure the JSON is well-formed:
            {
                "description": "...",
                "founders": [],
                "social_media_links": [],
                "academic_programs": {},
                "key_features": []
            }""",
            source=url,
            config=scraper_config
        )
        # Execute scraping
        raw_result = scraper.run()
        # Parse/validate/normalize; may raise the exceptions handled below.
        return process_scraping_result(raw_result)
    except ValidationError as ve:
        # Raised by jsonschema when the result misses required keys.
        return {"error": f"Validation error: {str(ve)}"}
    except json.JSONDecodeError as je:
        # Raised when the model output cannot be parsed/recovered as JSON.
        return {"error": f"JSON parsing error: {str(je)}"}
    except Exception as e:
        # Catch-all boundary: convert any scraper/model failure to an
        # error payload instead of crashing the Streamlit callback.
        return {"error": f"Scraping failed: {str(e)}"}
def process_scraping_result(raw_data: Union[str, Dict]) -> Dict[str, Any]:
    """Parse (if needed), schema-validate, and normalize a scrape result.

    String input is JSON-decoded, falling back to attempt_json_recovery()
    for malformed payloads. The parsed object must satisfy RESULT_SCHEMA;
    the "founders" and "social_media_links" entries are coerced to str.

    Raises:
        ValidationError: if the data does not match RESULT_SCHEMA.
        json.JSONDecodeError: if a string payload cannot be recovered.
    """
    if not isinstance(raw_data, str):
        parsed = raw_data
    else:
        try:
            parsed = json.loads(raw_data)
        except json.JSONDecodeError:
            parsed = attempt_json_recovery(raw_data)

    # Reject structurally invalid results before touching their keys.
    validate(instance=parsed, schema=RESULT_SCHEMA)

    # Normalize list entries to plain strings for display.
    parsed["founders"] = [str(name) for name in parsed["founders"]]
    parsed["social_media_links"] = list(map(str, parsed["social_media_links"]))
    return parsed
def attempt_json_recovery(raw_str: str, max_attempts: int = 3) -> Dict:
    """Attempt to fix malformed JSON strings

    Tries progressively more aggressive cleanups (the original loop ran
    the exact same deterministic cleanup max_attempts times, so retrying
    could never help; `max_attempts` now caps how many strategies run):
      1. parse after stripping surrounding whitespace;
      2. additionally strip Markdown ```json fences;
      3. additionally cut down to the outermost {...} span, dropping any
         prose the model emitted around the JSON object.

    Args:
        raw_str: candidate JSON text, possibly wrapped in fences/prose.
        max_attempts: maximum number of recovery strategies to try.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: if no strategy yields valid JSON. (Bug fix:
        the original passed a single argument to JSONDecodeError, whose
        constructor requires (msg, doc, pos) — so the failure path raised
        TypeError and bypassed callers' JSONDecodeError handlers.)
    """
    def _strip_fences(s: str) -> str:
        # Remove Markdown code fences the model often wraps output in.
        return s.replace("```json", "").replace("```", "").strip()

    def _extract_object(s: str) -> str:
        # Keep only the outermost {...} span; fall back to the
        # fence-stripped text when no brace pair is present.
        s = _strip_fences(s)
        start, end = s.find("{"), s.rfind("}")
        return s[start:end + 1] if start != -1 and end > start else s

    strategies = (str.strip, _strip_fences, _extract_object)
    for clean in strategies[:max_attempts]:
        try:
            return json.loads(clean(raw_str))
        except json.JSONDecodeError:
            continue
    raise json.JSONDecodeError("Failed to recover JSON after multiple attempts", raw_str, 0)
if __name__ == "__main__":
    # Launch the Streamlit UI when executed directly
    # (e.g. `streamlit run app.py`).
    main()