# app.py — Streamlit smart web-scraping assistant.
# NOTE: the original file began with Hugging Face upload metadata
# ("rodandegulle's picture / Update app.py / 8099efd verified"), which is
# web-page residue, not Python; preserved here as a comment.
import streamlit as st
from scrapegraphai.graphs import SmartScraperGraph
from transformers import pipeline
import torch
import json
from typing import Dict, Any, Union
from jsonschema import validate, ValidationError
# JSON schema for result validation.
# process_scraping_result() validates every LLM payload against this schema
# before the UI renders it; all five keys are required, so a partial answer
# from the model is rejected rather than half-displayed.
RESULT_SCHEMA = {
    "type": "object",
    "properties": {
        "description": {"type": "string"},
        "founders": {"type": "array"},
        "social_media_links": {"type": "array"},
        "academic_programs": {"type": "object"},
        "key_features": {"type": "array"},
    },
    "required": ["description", "founders", "social_media_links", "academic_programs", "key_features"],
}
def main() -> None:
    """Main application interface.

    Renders the Streamlit page, accepts a URL, and runs the scraping
    pipeline when the user clicks the button. Any failure is surfaced in
    the UI: pipeline-level errors come back as an ``{"error": ...}`` dict,
    and unexpected exceptions are caught and shown as a critical failure.
    """
    st.set_page_config(page_title="Web Scraper", layout="wide")
    # Fixed mojibake: the original literal was "πŸ”", the UTF-8 bytes of the
    # magnifying-glass emoji decoded with a legacy codepage.
    st.title("🔍 Smart Web Scraping Assistant")
    with st.container():
        url = st.text_input("Enter website URL:", "https://www.sliit.lk/")
        if st.button("Start Scraping", type="primary"):
            with st.status("Processing...", expanded=True) as status:
                try:
                    result = run_scraping_pipeline(url)
                    if "error" in result:
                        st.error(f"🚨 Error: {result['error']}")
                    else:
                        status.update(label="Processing complete!", state="complete")
                        display_scraping_results(result)
                except Exception as e:
                    # Broad catch is deliberate: a Streamlit page should report
                    # failures rather than crash the script run.
                    st.error(f"Critical failure: {str(e)}")
def display_scraping_results(data: Dict[str, Any]) -> None:
    """Display structured scraping results.

    Left column renders each schema field as a titled bullet section;
    right column shows the raw JSON payload for inspection. Missing keys
    fall back to empty values so a partially-filled dict still renders.
    """
    col1, col2 = st.columns([3, 2])
    with col1:
        st.subheader("Structured Data")
        st.markdown(f"**Description:** {data.get('description', 'N/A')}")
        # Fixed mojibake: the original section emoji ("🏒", "πŸ“±", "πŸŽ“")
        # were UTF-8 emoji bytes decoded with legacy codepages; restored to
        # the intended characters.
        display_section("🏢 Founders", data.get("founders", []))
        display_section("📱 Social Media", data.get("social_media_links", []))
        # academic_programs is an object, so render its (name, value) pairs.
        display_section("🎓 Academic Programs", list(data.get("academic_programs", {}).items()))
        display_section("⭐ Key Features", data.get("key_features", []))
    with col2:
        st.subheader("Raw JSON Output")
        st.code(json.dumps(data, indent=2), language="json")
def display_section(title: str, items: list) -> None:
    """Render a bold section title followed by one bullet per item.

    Renders nothing at all when ``items`` is empty.
    """
    if not items:
        return
    st.markdown(f"**{title}**")
    for entry in items:
        st.markdown(f"- {str(entry)}")
def initialize_text_generation_pipeline():
    """Build the local text-generation pipeline.

    The pipeline:
    - loads the DeepSeek R1 distilled Qwen 1.5B model,
    - runs in float16 with automatic device placement,
    - is tuned for compact, JSON-like completions (low temperature,
      bounded new tokens, mild repetition penalty, generated text only).
    """
    generation_settings = {
        "device_map": "auto",
        "torch_dtype": torch.float16,
        "trust_remote_code": True,
        "return_full_text": False,
        "max_new_tokens": 1024,
        "temperature": 0.1,
        "top_p": 0.9,
        "repetition_penalty": 1.1,
    }
    return pipeline(
        "text-generation",
        model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        **generation_settings,
    )
def run_scraping_pipeline(url: str) -> Dict[str, Any]:
    """Execute the complete scraping workflow.

    Initializes the local LLM pipeline, configures a SmartScraperGraph for
    ``url``, runs it, and validates/normalizes the output.

    Returns:
        The processed result dict on success, or ``{"error": <message>}``
        describing the failure — callers never see an exception.
    """
    extraction_prompt = """Extract structured information as JSON with:
- Company description (string)
- Founders (array of names)
- Social media links (array of URLs)
- Academic programs (object with program types)
- Key features (array of strings)
Return ONLY valid JSON using this structure. Ensure the JSON is well-formed:
{
"description": "...",
"founders": [],
"social_media_links": [],
"academic_programs": {},
"key_features": []
}"""
    try:
        llm_pipeline = initialize_text_generation_pipeline()
        # Graph configuration: hand the local pipeline to scrapegraphai and
        # ask for strict JSON output.
        graph_config = {
            "llm": {
                "model": llm_pipeline,
                "model_tokens": 8192,
                "json_mode": True,
                "response_format": {"type": "json_object"},
            },
            "verbose": False,
            "headless": True,
        }
        scraper = SmartScraperGraph(
            prompt=extraction_prompt,
            source=url,
            config=graph_config,
        )
        raw_result = scraper.run()
        return process_scraping_result(raw_result)
    except ValidationError as ve:
        return {"error": f"Validation error: {str(ve)}"}
    except json.JSONDecodeError as je:
        return {"error": f"JSON parsing error: {str(je)}"}
    except Exception as e:
        return {"error": f"Scraping failed: {str(e)}"}
def process_scraping_result(raw_data: Union[str, Dict]) -> Dict[str, Any]:
    """Parse, validate, and normalize a raw scraping result.

    String payloads are parsed as JSON, falling back to
    ``attempt_json_recovery`` for malformed output; dict payloads pass
    through unchanged. The object is then checked against RESULT_SCHEMA
    and the founders / social-media entries are coerced to strings.

    Raises:
        ValidationError: if the payload does not match RESULT_SCHEMA.
        json.JSONDecodeError: if a string payload cannot be parsed or recovered.
    """
    if not isinstance(raw_data, str):
        parsed_data = raw_data
    else:
        try:
            parsed_data = json.loads(raw_data)
        except json.JSONDecodeError:
            parsed_data = attempt_json_recovery(raw_data)
    # Schema check happens before coercion so errors point at the raw payload.
    validate(instance=parsed_data, schema=RESULT_SCHEMA)
    parsed_data["founders"] = [str(founder) for founder in parsed_data["founders"]]
    parsed_data["social_media_links"] = list(map(str, parsed_data["social_media_links"]))
    return parsed_data
def attempt_json_recovery(raw_str: str, max_attempts: int = 3) -> Dict:
    """Attempt to fix malformed JSON strings (e.g. fenced LLM output).

    Tries up to ``max_attempts`` progressively more aggressive strategies
    (the original looped ``max_attempts`` times but applied the identical
    cleaning on every pass, so retries were no-ops):
    1. Strip whitespace and markdown code fences (```json ... ```).
    2. Extract the substring between the first '{' and the last '}',
       dropping any surrounding prose.

    Args:
        raw_str: Possibly malformed JSON text.
        max_attempts: Upper bound on recovery strategies to try.

    Returns:
        The parsed JSON value.

    Raises:
        json.JSONDecodeError: if no strategy yields valid JSON.
    """
    cleaned = raw_str.strip().replace("```json", "").replace("```", "")
    candidates = [cleaned]
    # Strategy 2: isolate the outermost JSON object.
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if start != -1 and end > start:
        candidates.append(cleaned[start:end + 1])
    for candidate in candidates[:max_attempts]:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    # Bug fix: JSONDecodeError requires (msg, doc, pos); the original passed
    # only msg, which made this line raise TypeError instead of the intended
    # error that run_scraping_pipeline catches.
    raise json.JSONDecodeError("Failed to recover JSON after multiple attempts", raw_str, 0)
# Script entry point: launch the Streamlit app when run directly.
if __name__ == "__main__":
    main()