"""Smart Web Scraping Assistant.

Streamlit app that scrapes a website with ScrapeGraphAI driven by a local
DeepSeek text-generation model, validates the extracted JSON against a
schema, and displays the structured result.
"""
| import streamlit as st | |
| from scrapegraphai.graphs import SmartScraperGraph | |
| from transformers import pipeline | |
| import torch | |
| import json | |
| from typing import Dict, Any, Union | |
| from jsonschema import validate, ValidationError | |
# JSON schema for result validation
# All five keys are mandatory in the scraper's output; jsonschema.validate()
# raises ValidationError when any is missing or has the wrong container type.
# Element/value types inside the containers are NOT constrained here —
# process_scraping_result() coerces the list entries to str afterwards.
RESULT_SCHEMA = {
    "type": "object",
    "properties": {
        "description": {"type": "string"},        # free-text site/company summary
        "founders": {"type": "array"},            # founder names (stringified downstream)
        "social_media_links": {"type": "array"},  # URLs (stringified downstream)
        "academic_programs": {"type": "object"},  # program type -> details mapping
        "key_features": {"type": "array"}         # notable feature strings
    },
    "required": ["description", "founders", "social_media_links", "academic_programs", "key_features"]
}
def main() -> None:
    """Streamlit entry point: collect a URL, run the scraping pipeline,
    and render either the structured results or the error message.
    """
    st.set_page_config(page_title="Web Scraper", layout="wide")
    # NOTE(review): "π"/"π¨" below look like mojibake'd emoji from a bad
    # encoding round-trip — confirm the intended glyphs before changing them.
    st.title("π Smart Web Scraping Assistant")
    with st.container():
        url = st.text_input("Enter website URL:", "https://www.sliit.lk/")
        if st.button("Start Scraping", type="primary"):
            with st.status("Processing...", expanded=True) as status:
                try:
                    result = run_scraping_pipeline(url)
                    if "error" in result:
                        # Bug fix: also flip the status widget to its error
                        # state instead of leaving it stuck on "Processing...".
                        status.update(label="Scraping failed", state="error")
                        st.error(f"π¨ Error: {result['error']}")
                    else:
                        status.update(label="Processing complete!", state="complete")
                        display_scraping_results(result)
                except Exception as e:
                    # Same fix on the unexpected-exception path.
                    status.update(label="Scraping failed", state="error")
                    st.error(f"Critical failure: {str(e)}")
def display_scraping_results(data: Dict[str, Any]) -> None:
    """Render a validated scrape result in two columns: structured
    fields on the left, the raw JSON payload on the right.
    """
    structured_col, raw_col = st.columns([3, 2])

    with structured_col:
        st.subheader("Structured Data")
        st.markdown(f"**Description:** {data.get('description', 'N/A')}")
        # Title/items pairs, rendered in a fixed order.
        sections = [
            ("π’ Founders", data.get("founders", [])),
            ("π± Social Media", data.get("social_media_links", [])),
            ("π Academic Programs", list(data.get("academic_programs", {}).items())),
            ("β Key Features", data.get("key_features", [])),
        ]
        for section_title, section_items in sections:
            display_section(section_title, section_items)

    with raw_col:
        st.subheader("Raw JSON Output")
        st.code(json.dumps(data, indent=2), language="json")
def display_section(title: str, items: list) -> None:
    """Emit a bold section heading followed by one bullet per item.

    Sections with no items are skipped entirely (no empty heading).
    """
    if not items:
        return
    st.markdown(f"**{title}**")
    for entry in items:
        st.markdown(f"- {str(entry)}")
def initialize_text_generation_pipeline():
    """
    Build the text-generation pipeline used as the scraper's LLM backend.

    Loads the DeepSeek R1 distilled Qwen 1.5B model with half-precision
    weights, automatic device placement, and low-temperature sampling
    settings chosen to keep the output close to well-formed JSON.
    """
    generation_settings = {
        "device_map": "auto",            # let accelerate pick CPU/GPU placement
        "torch_dtype": torch.float16,    # halve memory footprint vs fp32
        "trust_remote_code": True,       # model repo ships custom code
        "return_full_text": False,       # return only the completion, not the prompt
        "max_new_tokens": 1024,
        "temperature": 0.1,              # near-deterministic decoding
        "top_p": 0.9,
        "repetition_penalty": 1.1,
    }
    return pipeline(
        "text-generation",
        model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        **generation_settings,
    )
def run_scraping_pipeline(url: str) -> Dict[str, Any]:
    """Execute the complete scraping workflow

    Builds the local HF text-generation pipeline, wires it into a
    SmartScraperGraph with a JSON-only extraction prompt, runs the
    scrape, and returns the validated/normalized result dict.

    Args:
        url: website to scrape.

    Returns:
        The parsed result dict on success, or ``{"error": "<message>"}``
        on any failure — callers (see main) check for the "error" key
        rather than catching exceptions.
    """
    try:
        # Initialize text generation pipeline
        text_pipeline = initialize_text_generation_pipeline()
        # Configure scraping graph
        # NOTE(review): passing a transformers pipeline object directly as
        # "model" assumes this scrapegraphai version accepts pre-built
        # pipelines — verify against the installed version's llm config.
        scraper_config = {
            "llm": {
                "model": text_pipeline,
                "model_tokens": 8192,  # context budget advertised to the graph
                "json_mode": True,
                "response_format": {"type": "json_object"}
            },
            "verbose": False,
            "headless": True  # no visible browser window during fetch
        }
        # Initialize scraper
        scraper = SmartScraperGraph(
            prompt="""Extract structured information as JSON with:
            - Company description (string)
            - Founders (array of names)
            - Social media links (array of URLs)
            - Academic programs (object with program types)
            - Key features (array of strings)
            Return ONLY valid JSON using this structure. Ensure the JSON is well-formed:
            {
                "description": "...",
                "founders": [],
                "social_media_links": [],
                "academic_programs": {},
                "key_features": []
            }""",
            source=url,
            config=scraper_config
        )
        # Execute scraping
        raw_result = scraper.run()
        # Parse/validate/normalize; may raise the exceptions handled below.
        return process_scraping_result(raw_result)
    except ValidationError as ve:
        # Raised by jsonschema when the result misses required keys.
        return {"error": f"Validation error: {str(ve)}"}
    except json.JSONDecodeError as je:
        # Raised when the model output cannot be parsed/recovered as JSON.
        return {"error": f"JSON parsing error: {str(je)}"}
    except Exception as e:
        # Catch-all boundary: convert any scraper/model failure to an
        # error payload instead of crashing the Streamlit callback.
        return {"error": f"Scraping failed: {str(e)}"}
def process_scraping_result(raw_data: Union[str, Dict]) -> Dict[str, Any]:
    """Parse (if needed), schema-validate, and normalize a scrape result.

    String input is JSON-decoded, falling back to attempt_json_recovery()
    for malformed payloads. The parsed object must satisfy RESULT_SCHEMA;
    the "founders" and "social_media_links" entries are coerced to str.

    Raises:
        ValidationError: if the data does not match RESULT_SCHEMA.
        json.JSONDecodeError: if a string payload cannot be recovered.
    """
    if not isinstance(raw_data, str):
        parsed = raw_data
    else:
        try:
            parsed = json.loads(raw_data)
        except json.JSONDecodeError:
            parsed = attempt_json_recovery(raw_data)

    # Reject structurally invalid results before touching their keys.
    validate(instance=parsed, schema=RESULT_SCHEMA)

    # Normalize list entries to plain strings for display.
    parsed["founders"] = [str(name) for name in parsed["founders"]]
    parsed["social_media_links"] = list(map(str, parsed["social_media_links"]))
    return parsed
def attempt_json_recovery(raw_str: str, max_attempts: int = 3) -> Dict:
    """Attempt to fix malformed JSON strings

    Tries progressively more aggressive cleanups (the original loop ran
    the exact same deterministic cleanup max_attempts times, so retrying
    could never help; `max_attempts` now caps how many strategies run):
      1. parse after stripping surrounding whitespace;
      2. additionally strip Markdown ```json fences;
      3. additionally cut down to the outermost {...} span, dropping any
         prose the model emitted around the JSON object.

    Args:
        raw_str: candidate JSON text, possibly wrapped in fences/prose.
        max_attempts: maximum number of recovery strategies to try.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: if no strategy yields valid JSON. (Bug fix:
        the original passed a single argument to JSONDecodeError, whose
        constructor requires (msg, doc, pos) — so the failure path raised
        TypeError and bypassed callers' JSONDecodeError handlers.)
    """
    def _strip_fences(s: str) -> str:
        # Remove Markdown code fences the model often wraps output in.
        return s.replace("```json", "").replace("```", "").strip()

    def _extract_object(s: str) -> str:
        # Keep only the outermost {...} span; fall back to the
        # fence-stripped text when no brace pair is present.
        s = _strip_fences(s)
        start, end = s.find("{"), s.rfind("}")
        return s[start:end + 1] if start != -1 and end > start else s

    strategies = (str.strip, _strip_fences, _extract_object)
    for clean in strategies[:max_attempts]:
        try:
            return json.loads(clean(raw_str))
        except json.JSONDecodeError:
            continue
    raise json.JSONDecodeError("Failed to recover JSON after multiple attempts", raw_str, 0)
if __name__ == "__main__":
    # Launch the Streamlit UI when executed directly
    # (e.g. `streamlit run app.py`).
    main()