| """ |
| AI Dataset Studio - Complete Application |
| Fixed version with all classes properly defined |
| """ |
|
|
| import gradio as gr |
| import pandas as pd |
| import numpy as np |
| import json |
| import re |
| import requests |
| from bs4 import BeautifulSoup |
| from urllib.parse import urlparse, urljoin |
| from datetime import datetime, timedelta |
| import logging |
| from typing import Dict, List, Tuple, Optional, Any |
| from dataclasses import dataclass, asdict |
| from pathlib import Path |
| import uuid |
| import hashlib |
| import time |
| from collections import defaultdict |
| import io |
|
|
| |
| try: |
| from transformers import pipeline, AutoTokenizer, AutoModel |
| HAS_TRANSFORMERS = True |
| except ImportError: |
| HAS_TRANSFORMERS = False |
|
|
| try: |
| import nltk |
| from nltk.tokenize import sent_tokenize, word_tokenize |
| HAS_NLTK = True |
| except ImportError: |
| HAS_NLTK = False |
|
|
| try: |
| from datasets import Dataset, DatasetDict |
| HAS_DATASETS = True |
| except ImportError: |
| HAS_DATASETS = False |
|
|
| |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
| logger = logging.getLogger(__name__) |
|
|
| |
| if HAS_NLTK: |
| try: |
| nltk.download('punkt', quiet=True) |
| nltk.download('stopwords', quiet=True) |
| nltk.download('averaged_perceptron_tagger', quiet=True) |
| except Exception: |
| pass  # downloads are best-effort; NLTK features degrade gracefully if they fail |
|
|
| @dataclass |
| class ScrapedItem: |
| """Data class for scraped content""" |
| id: str |
| url: str |
| title: str |
| content: str |
| metadata: Dict[str, Any] |
| scraped_at: str |
| word_count: int |
| language: str = "en" |
| quality_score: float = 0.0 |
| labels: Optional[List[str]] = None |
| annotations: Optional[Dict[str, Any]] = None |
|
|
| def __post_init__(self): |
| if self.labels is None: |
| self.labels = [] |
| if self.annotations is None: |
| self.annotations = {} |
|
|
| @dataclass |
| class DatasetTemplate: |
| """Template for dataset creation""" |
| name: str |
| description: str |
| task_type: str |
| required_fields: List[str] |
| optional_fields: List[str] |
| example_format: Dict[str, Any] |
| instructions: str |
|
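| # Serialization sketch (illustrative, not executed): both dataclasses flatten to |
| # plain dicts via dataclasses.asdict (imported above), which is convenient for |
| # json.dump and logging. |
| # |
| #   item = ScrapedItem(id="1", url="https://example.com", title="Demo", content="Hello world", |
| #                      metadata={}, scraped_at=datetime.now().isoformat(), word_count=2) |
| #   record = asdict(item)  # {'id': '1', 'url': 'https://example.com', ...} |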
|
| class SecurityValidator: |
| """Security validation for URLs and content""" |
| |
| ALLOWED_SCHEMES = {'http', 'https'} |
| BLOCKED_DOMAINS = { |
| 'localhost', '127.0.0.1', '0.0.0.0', |
| '192.168.', '10.', '172.16.', '172.17.', |
| '172.18.', '172.19.', '172.20.', '172.21.', |
| '172.22.', '172.23.', '172.24.', '172.25.', |
| '172.26.', '172.27.', '172.28.', '172.29.', |
| '172.30.', '172.31.' |
| } |
| |
| @classmethod |
| def validate_url(cls, url: str) -> Tuple[bool, str]: |
| """Validate URL for security concerns""" |
| try: |
| parsed = urlparse(url) |
| |
| if parsed.scheme not in cls.ALLOWED_SCHEMES: |
| return False, f"Invalid scheme: {parsed.scheme}" |
| |
| hostname = parsed.hostname or '' |
| # prefix/equality check instead of substring match, so e.g. "web10.example.com" is not caught by "10." |
| if any(hostname == blocked or hostname.startswith(blocked) for blocked in cls.BLOCKED_DOMAINS): |
| return False, "Access to internal networks not allowed" |
| |
| if not parsed.netloc: |
| return False, "Invalid URL format" |
| |
| return True, "URL is valid" |
| |
| except Exception as e: |
| return False, f"URL validation error: {str(e)}" |
|
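| # Usage sketch (illustrative, not executed): |
| # |
| #   ok, msg = SecurityValidator.validate_url("https://example.com/article")  # True, "URL is valid" |
| #   ok, msg = SecurityValidator.validate_url("http://127.0.0.1:8080/admin")  # False, internal host blocked |
| #   ok, msg = SecurityValidator.validate_url("ftp://example.com/file")       # False, invalid scheme |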
|
| class WebScraperEngine: |
| """Advanced web scraping engine""" |
| |
| def __init__(self): |
| self.session = requests.Session() |
| self.session.headers.update({ |
| 'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0)', |
| 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', |
| 'Accept-Language': 'en-US,en;q=0.5', |
| 'Connection': 'keep-alive', |
| }) |
| |
| def scrape_url(self, url: str) -> Optional[ScrapedItem]: |
| """Scrape a single URL""" |
| try: |
| |
| is_valid, validation_msg = SecurityValidator.validate_url(url) |
| if not is_valid: |
| raise ValueError(f"Security validation failed: {validation_msg}") |
| |
| |
| response = self.session.get(url, timeout=15) |
| response.raise_for_status() |
| |
| |
| soup = BeautifulSoup(response.content, 'html.parser') |
| |
| |
| title = self._extract_title(soup) |
| content = self._extract_content(soup) |
| metadata = self._extract_metadata(soup, response) |
| |
| |
| item = ScrapedItem( |
| id=str(uuid.uuid4()), |
| url=url, |
| title=title, |
| content=content, |
| metadata=metadata, |
| scraped_at=datetime.now().isoformat(), |
| word_count=len(content.split()), |
| quality_score=self._assess_quality(content) |
| ) |
| |
| return item |
| |
| except Exception as e: |
| logger.error(f"Failed to scrape {url}: {e}") |
| return None |
| |
| def batch_scrape(self, urls: List[str], progress_callback=None) -> List[ScrapedItem]: |
| """Scrape multiple URLs""" |
| results = [] |
| total = len(urls) |
| |
| for i, url in enumerate(urls): |
| if progress_callback: |
| progress_callback(i / total, f"Scraping {i+1}/{total}: {url[:50]}...") |
| |
| item = self.scrape_url(url) |
| if item: |
| results.append(item) |
| |
| time.sleep(1)  # fixed 1s delay between requests to be polite to target servers |
| |
| return results |
| |
| def _extract_title(self, soup: BeautifulSoup) -> str: |
| """Extract page title""" |
| title_tag = soup.find('title') |
| if title_tag: |
| return title_tag.get_text().strip() |
| |
| h1_tag = soup.find('h1') |
| if h1_tag: |
| return h1_tag.get_text().strip() |
| |
| return "Untitled" |
| |
| def _extract_content(self, soup: BeautifulSoup) -> str: |
| """Extract main content""" |
| |
| for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']): |
| element.decompose() |
| |
| |
| content_selectors = [ |
| 'article', 'main', '.content', '.post-content', |
| '.entry-content', '.article-body' |
| ] |
| |
| for selector in content_selectors: |
| element = soup.select_one(selector) |
| if element: |
| text = element.get_text(separator=' ', strip=True) |
| if len(text) > 200: |
| return self._clean_text(text) |
| |
| |
| body = soup.find('body') |
| if body: |
| return self._clean_text(body.get_text(separator=' ', strip=True)) |
| |
| return self._clean_text(soup.get_text(separator=' ', strip=True)) |
| |
| def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]: |
| """Extract metadata""" |
| metadata = { |
| 'domain': urlparse(response.url).netloc, |
| 'status_code': response.status_code, |
| 'extracted_at': datetime.now().isoformat() |
| } |
| |
| |
| for tag in ['description', 'keywords', 'author']: |
| element = soup.find('meta', attrs={'name': tag}) |
| if element: |
| metadata[tag] = element.get('content', '') |
| |
| return metadata |
| |
| def _clean_text(self, text: str) -> str: |
| """Clean extracted text""" |
| text = re.sub(r'\s+', ' ', text) |
| text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE) |
| text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE) |
| return text.strip() |
| |
| def _assess_quality(self, content: str) -> float: |
| """Assess content quality""" |
| if not content: |
| return 0.0 |
| |
| score = 0.0 |
| word_count = len(content.split()) |
| |
| if word_count >= 50: |
| score += 0.4 |
| elif word_count >= 20: |
| score += 0.2 |
| |
| sentence_count = len(re.split(r'[.!?]+', content)) |
| if sentence_count >= 3: |
| score += 0.3 |
| |
| if re.search(r'[A-Z][a-z]+', content): |
| score += 0.3 |
| |
| return min(score, 1.0) |
|
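| # Usage sketch (illustrative; performs a real HTTP request, so the result depends |
| # on network access and the target page): |
| # |
| #   scraper = WebScraperEngine() |
| #   item = scraper.scrape_url("https://example.com/article")  # returns None on any failure |
| #   if item: |
| #       print(item.title, item.word_count, round(item.quality_score, 2)) |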
|
| class DataProcessor: |
| """Data processing pipeline""" |
| |
| def __init__(self): |
| self.sentiment_analyzer = None |
| self.ner_model = None |
| self._load_models() |
| |
| def _load_models(self): |
| """Load NLP models""" |
| if not HAS_TRANSFORMERS: |
| logger.warning("β οΈ Transformers not available") |
| return |
| |
| try: |
| self.sentiment_analyzer = pipeline( |
| "sentiment-analysis", |
| model="cardiffnlp/twitter-roberta-base-sentiment-latest" |
| ) |
| logger.info("β
Sentiment model loaded") |
| except Exception as e: |
| logger.warning(f"β οΈ Could not load sentiment model: {e}") |
| |
| def process_items(self, items: List[ScrapedItem], options: Dict[str, bool]) -> List[ScrapedItem]: |
| """Process scraped items""" |
| processed = [] |
| |
| for item in items: |
| try: |
| |
| if options.get('clean_text', True): |
| item.content = self._clean_text_advanced(item.content) |
| |
| |
| if options.get('quality_filter', True) and item.quality_score < 0.3: |
| continue |
| |
| |
| if options.get('add_sentiment', False) and self.sentiment_analyzer: |
| sentiment = self._analyze_sentiment(item.content) |
| item.metadata['sentiment'] = sentiment |
| |
| |
| if options.get('detect_language', True): |
| item.language = self._detect_language(item.content) |
| |
| processed.append(item) |
| |
| except Exception as e: |
| logger.error(f"Error processing item {item.id}: {e}") |
| continue |
| |
| return processed |
| |
| def _clean_text_advanced(self, text: str) -> str: |
| """Advanced text cleaning""" |
| text = re.sub(r'http\S+|www\.\S+', '', text) |
| text = re.sub(r'\S+@\S+', '', text) |
| text = re.sub(r'\s+', ' ', text) |
| return text.strip() |
| |
| def _analyze_sentiment(self, text: str) -> Dict[str, Any]: |
| """Analyze sentiment""" |
| try: |
| text_sample = text[:512]  # rough character-level truncation to stay within the model's input window |
| result = self.sentiment_analyzer(text_sample)[0] |
| return { |
| 'label': result['label'], |
| 'score': result['score'] |
| } |
| except Exception: |
| return {'label': 'UNKNOWN', 'score': 0.0} |
| |
| def _detect_language(self, text: str) -> str: |
| """Simple language detection""" |
| if re.search(r'[а-яё]', text.lower()): |
| return 'ru' |
| elif re.search(r'[ñáéíóúü]', text.lower()): |
| return 'es' |
| return 'en' |
|
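| # Processing sketch (illustrative): the options dict mirrors the checkboxes in the |
| # UI below; 'add_sentiment' only has an effect when the transformers pipeline loaded. |
| # |
| #   processor = DataProcessor() |
| #   kept = processor.process_items(items, { |
| #       'clean_text': True, 'quality_filter': True, |
| #       'detect_language': True, 'add_sentiment': False, |
| #   })  # items scoring below 0.3 are dropped when quality_filter is on |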
|
| class AnnotationEngine: |
| """Annotation tools for dataset creation""" |
| |
| def __init__(self): |
| self.templates = self._load_templates() |
| |
| def _load_templates(self) -> Dict[str, DatasetTemplate]: |
| """Load dataset templates""" |
| templates = { |
| 'text_classification': DatasetTemplate( |
| name="Text Classification", |
| description="Classify text into categories", |
| task_type="classification", |
| required_fields=["text", "label"], |
| optional_fields=["confidence", "metadata"], |
| example_format={"text": "Sample text", "label": "positive"}, |
| instructions="Label each text with appropriate category" |
| ), |
| 'sentiment_analysis': DatasetTemplate( |
| name="Sentiment Analysis", |
| description="Analyze emotional tone", |
| task_type="classification", |
| required_fields=["text", "sentiment"], |
| optional_fields=["confidence", "aspects"], |
| example_format={"text": "I love this!", "sentiment": "positive"}, |
| instructions="Classify sentiment as positive, negative, or neutral" |
| ), |
| 'named_entity_recognition': DatasetTemplate( |
| name="Named Entity Recognition", |
| description="Identify named entities", |
| task_type="ner", |
| required_fields=["text", "entities"], |
| optional_fields=["metadata"], |
| example_format={ |
| "text": "John works at OpenAI", |
| "entities": [{"text": "John", "label": "PERSON"}] |
| }, |
| instructions="Mark all named entities" |
| ), |
| 'question_answering': DatasetTemplate( |
| name="Question Answering", |
| description="Create Q&A pairs", |
| task_type="qa", |
| required_fields=["context", "question", "answer"], |
| optional_fields=["answer_start", "metadata"], |
| example_format={ |
| "context": "The capital of France is Paris.", |
| "question": "What is the capital of France?", |
| "answer": "Paris" |
| }, |
| instructions="Create meaningful questions and answers" |
| ), |
| 'summarization': DatasetTemplate( |
| name="Text Summarization", |
| description="Create summaries", |
| task_type="summarization", |
| required_fields=["text", "summary"], |
| optional_fields=["summary_type", "length"], |
| example_format={ |
| "text": "Long article text...", |
| "summary": "Brief summary" |
| }, |
| instructions="Write clear, concise summaries" |
| ) |
| } |
| return templates |
|
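| # Template lookup sketch (illustrative): templates are keyed by task id, and each |
| # one declares the fields an exported record must carry. |
| # |
| #   engine = AnnotationEngine() |
| #   qa = engine.templates['question_answering'] |
| #   print(qa.required_fields)  # ['context', 'question', 'answer'] |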
|
| class DatasetExporter: |
| """Export datasets in various formats""" |
| |
| def __init__(self): |
| self.supported_formats = [ |
| 'json', 'csv', 'jsonl', 'huggingface_datasets' |
| ] |
| |
| def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate, |
| export_format: str, annotations: Dict[str, Any] = None) -> str: |
| """Export dataset""" |
| try: |
| dataset_data = self._prepare_data(items, template, annotations) |
| |
| if export_format == 'json': |
| return self._export_json(dataset_data) |
| elif export_format == 'csv': |
| return self._export_csv(dataset_data) |
| elif export_format == 'jsonl': |
| return self._export_jsonl(dataset_data) |
| elif export_format == 'huggingface_datasets': |
| return self._export_huggingface(dataset_data, template) |
| else: |
| raise ValueError(f"Unsupported format: {export_format}") |
| |
| except Exception as e: |
| logger.error(f"Export failed: {e}") |
| raise |
| |
| def _prepare_data(self, items: List[ScrapedItem], template: DatasetTemplate, |
| annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]: |
| """Prepare data according to template""" |
| dataset_data = [] |
| |
| for item in items: |
| data_point = { |
| 'text': item.content, |
| 'title': item.title, |
| 'url': item.url, |
| 'metadata': item.metadata |
| } |
| |
| if annotations and item.id in annotations: |
| data_point.update(annotations[item.id]) |
| |
| formatted = self._format_for_template(data_point, template) |
| if formatted: |
| dataset_data.append(formatted) |
| |
| return dataset_data |
| |
| def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Dict[str, Any]: |
| """Format data according to template""" |
| formatted = {} |
| |
| for field in template.required_fields: |
| if field in data_point: |
| formatted[field] = data_point[field] |
| elif field == 'text' and 'content' in data_point: |
| formatted[field] = data_point['content'] |
| else: |
| return None |
| |
| for field in template.optional_fields: |
| if field in data_point: |
| formatted[field] = data_point[field] |
| |
| return formatted |
| |
| def _export_json(self, data: List[Dict[str, Any]]) -> str: |
| """Export as JSON""" |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| filename = f"dataset_{timestamp}.json" |
| |
| with open(filename, 'w', encoding='utf-8') as f: |
| json.dump(data, f, indent=2, ensure_ascii=False) |
| |
| return filename |
| |
| def _export_csv(self, data: List[Dict[str, Any]]) -> str: |
| """Export as CSV""" |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| filename = f"dataset_{timestamp}.csv" |
| |
| df = pd.DataFrame(data) |
| df.to_csv(filename, index=False) |
| |
| return filename |
| |
| def _export_jsonl(self, data: List[Dict[str, Any]]) -> str: |
| """Export as JSONL""" |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| filename = f"dataset_{timestamp}.jsonl" |
| |
| with open(filename, 'w', encoding='utf-8') as f: |
| for item in data: |
| f.write(json.dumps(item, ensure_ascii=False) + '\n') |
| |
| return filename |
| |
| def _export_huggingface(self, data: List[Dict[str, Any]], template: DatasetTemplate) -> str: |
| """Export as HuggingFace Dataset""" |
| if not HAS_DATASETS: |
| raise ImportError("datasets library not available") |
| |
| dataset = Dataset.from_list(data) |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}" |
| |
| dataset.save_to_disk(dataset_name) |
| return dataset_name |
|
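| # Export sketch (illustrative): records missing a template's required fields are |
| # silently dropped by _format_for_template, so e.g. a classification export needs a |
| # 'label' supplied through the annotations mapping (keyed by item id). |
| # |
| #   exporter = DatasetExporter() |
| #   path = exporter.export_dataset(items, template, 'jsonl', |
| #                                  annotations={items[0].id: {'label': 'news'}}) |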
|
| class DatasetStudio: |
| """Main application orchestrator""" |
| |
| def __init__(self): |
| self.scraper = WebScraperEngine() |
| self.processor = DataProcessor() |
| self.annotator = AnnotationEngine() |
| self.exporter = DatasetExporter() |
| |
| |
| self.scraped_items = [] |
| self.processed_items = [] |
| self.current_project = None |
| self.annotation_state = {} |
| |
| logger.info("β
DatasetStudio initialized successfully") |
| |
| def start_new_project(self, project_name: str, template_type: str) -> Dict[str, Any]: |
| """Start new project""" |
| self.current_project = { |
| 'name': project_name, |
| 'template': template_type, |
| 'created_at': datetime.now().isoformat(), |
| 'id': str(uuid.uuid4()) |
| } |
| |
| self.scraped_items = [] |
| self.processed_items = [] |
| self.annotation_state = {} |
| |
| logger.info(f"π New project: {project_name}") |
| return self.current_project |
| |
| def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]: |
| """Scrape URLs""" |
| url_list = [url.strip() for url in urls if url.strip()] |
| |
| if not url_list: |
| return 0, ["No valid URLs provided"] |
| |
| logger.info(f"π·οΈ Scraping {len(url_list)} URLs") |
| self.scraped_items = self.scraper.batch_scrape(url_list, progress_callback) |
| |
| success = len(self.scraped_items) |
| failed = len(url_list) - success |
| |
| errors = [] |
| if failed > 0: |
| errors.append(f"{failed} URLs failed") |
| |
| logger.info(f"β
Scraped {success}, failed {failed}") |
| return success, errors |
| |
| def process_data(self, options: Dict[str, bool]) -> int: |
| """Process scraped data""" |
| if not self.scraped_items: |
| return 0 |
| |
| logger.info(f"βοΈ Processing {len(self.scraped_items)} items") |
| self.processed_items = self.processor.process_items(self.scraped_items, options) |
| |
| logger.info(f"β
Processed {len(self.processed_items)} items") |
| return len(self.processed_items) |
| |
| def get_data_preview(self, num_items: int = 5) -> List[Dict[str, Any]]: |
| """Get data preview""" |
| items = self.processed_items or self.scraped_items |
| |
| preview = [] |
| for item in items[:num_items]: |
| preview.append({ |
| 'title': item.title, |
| 'content_preview': item.content[:200] + "..." if len(item.content) > 200 else item.content, |
| 'word_count': item.word_count, |
| 'quality_score': round(item.quality_score, 2), |
| 'url': item.url |
| }) |
| |
| return preview |
| |
| def get_data_statistics(self) -> Dict[str, Any]: |
| """Get dataset statistics""" |
| items = self.processed_items or self.scraped_items |
| |
| if not items: |
| return {} |
| |
| word_counts = [item.word_count for item in items] |
| quality_scores = [item.quality_score for item in items] |
| |
| return { |
| 'total_items': len(items), |
| 'avg_word_count': round(np.mean(word_counts)), |
| 'avg_quality_score': round(np.mean(quality_scores), 2), |
| 'word_count_range': [min(word_counts), max(word_counts)], |
| 'quality_range': [round(min(quality_scores), 2), round(max(quality_scores), 2)], |
| 'languages': list(set(item.language for item in items)), |
| 'domains': list(set(urlparse(item.url).netloc for item in items)) |
| } |
| |
| def export_dataset(self, template_name: str, export_format: str, annotations: Dict[str, Any] = None) -> str: |
| """Export dataset""" |
| if not self.processed_items and not self.scraped_items: |
| raise ValueError("No data to export") |
| |
| items = self.processed_items or self.scraped_items |
| template = self.annotator.templates.get(template_name) |
| |
| if not template: |
| raise ValueError(f"Unknown template: {template_name}") |
| |
| logger.info(f"π€ Exporting {len(items)} items") |
| return self.exporter.export_dataset(items, template, export_format, annotations) |
|
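| # End-to-end sketch (illustrative; hits the network): the same pipeline the Gradio |
| # UI drives, usable programmatically. |
| # |
| #   studio = DatasetStudio() |
| #   studio.start_new_project("demo", "text_classification") |
| #   studio.scrape_urls(["https://example.com/article"]) |
| #   studio.process_data({'clean_text': True, 'quality_filter': True}) |
| #   print(studio.get_data_statistics()) |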
|
| def create_modern_interface(): |
| """Create the modern Gradio interface""" |
| |
| |
| studio = DatasetStudio() |
| |
| |
| css = """ |
| .gradio-container { max-width: 1400px; margin: auto; } |
| .studio-header { |
| background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
| color: white; padding: 2rem; border-radius: 15px; |
| margin-bottom: 2rem; text-align: center; |
| } |
| .workflow-card { |
| background: #f8f9ff; border: 2px solid #e1e5ff; |
| border-radius: 12px; padding: 1.5rem; margin: 1rem 0; |
| } |
| .step-header { |
| font-size: 1.2em; font-weight: 600; color: #4c51bf; |
| margin-bottom: 1rem; |
| } |
| """ |
| |
|
| with gr.Blocks(css=css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface: |
| # gr.State must be created inside the Blocks context so the event handlers below can read and write it |
| project_state = gr.State({}) |
| |
| |
| gr.HTML(""" |
| <div class="studio-header"> |
| <h1>π AI Dataset Studio</h1> |
| <p>Create high-quality training datasets without coding</p> |
| </div> |
| """) |
| |
| with gr.Tabs() as main_tabs: |
| |
| |
| with gr.Tab("π― Project Setup"): |
| gr.HTML('<div class="step-header">Step 1: Create Your Project</div>') |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| project_name = gr.Textbox( |
| label="Project Name", |
| placeholder="My Dataset Project", |
| value="News Analysis Dataset" |
| ) |
| |
| template_choice = gr.Radio( |
| choices=[ |
| ("π Text Classification", "text_classification"), |
| ("π Sentiment Analysis", "sentiment_analysis"), |
| ("π₯ Named Entity Recognition", "named_entity_recognition"), |
| ("β Question Answering", "question_answering"), |
| ("π Text Summarization", "summarization") |
| ], |
| label="Dataset Type", |
| value="text_classification" |
| ) |
| |
| create_project_btn = gr.Button("π Create Project", variant="primary") |
| project_status = gr.Markdown("") |
| |
| with gr.Column(scale=1): |
| gr.HTML(""" |
| <div class="workflow-card"> |
| <h3>π‘ Template Guide</h3> |
| <p><strong>Text Classification:</strong> Categorize content</p> |
| <p><strong>Sentiment Analysis:</strong> Analyze emotions</p> |
| <p><strong>Named Entity Recognition:</strong> Identify entities</p> |
| <p><strong>Question Answering:</strong> Create Q&A pairs</p> |
| <p><strong>Summarization:</strong> Generate summaries</p> |
| </div> |
| """) |
| |
| |
| with gr.Tab("π·οΈ Data Collection"): |
| gr.HTML('<div class="step-header">Step 2: Collect Your Data</div>') |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| urls_input = gr.Textbox( |
| label="URLs to Scrape (one per line)", |
| placeholder="https://example.com/article1\nhttps://example.com/article2", |
| lines=8 |
| ) |
| |
| scrape_btn = gr.Button("π Start Scraping", variant="primary") |
| scraping_status = gr.Markdown("") |
| |
| with gr.Column(scale=1): |
| collection_stats = gr.HTML("") |
| |
| |
| with gr.Tab("βοΈ Data Processing"): |
| gr.HTML('<div class="step-header">Step 3: Clean & Enhance</div>') |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| with gr.Row(): |
| with gr.Column(): |
| clean_text = gr.Checkbox(label="π§Ή Text Cleaning", value=True) |
| quality_filter = gr.Checkbox(label="π― Quality Filter", value=True) |
| detect_language = gr.Checkbox(label="π Language Detection", value=True) |
| |
| with gr.Column(): |
| add_sentiment = gr.Checkbox(label="π Sentiment Analysis", value=False) |
| extract_entities = gr.Checkbox(label="π₯ Entity Extraction", value=False) |
| |
| process_btn = gr.Button("βοΈ Process Data", variant="primary") |
| processing_status = gr.Markdown("") |
| |
| with gr.Column(scale=1): |
| processing_stats = gr.HTML("") |
| |
| |
| with gr.Tab("π Data Preview"): |
| gr.HTML('<div class="step-header">Step 4: Review Dataset</div>') |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| refresh_btn = gr.Button("π Refresh Preview", variant="secondary") |
| |
| data_preview = gr.DataFrame( |
| headers=["Title", "Content Preview", "Words", "Quality", "URL"], |
| label="Dataset Preview" |
| ) |
| |
| with gr.Column(scale=1): |
| dataset_stats = gr.JSON(label="Statistics") |
| |
| |
| with gr.Tab("π€ Export Dataset"): |
| gr.HTML('<div class="step-header">Step 5: Export Your Dataset</div>') |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| export_format = gr.Radio( |
| choices=[ |
| ("π JSON", "json"), |
| ("π CSV", "csv"), |
| ("π JSONL", "jsonl"), |
| ("π€ HuggingFace", "huggingface_datasets") |
| ], |
| label="Export Format", |
| value="json" |
| ) |
| |
| export_template = gr.Dropdown( |
| choices=[ |
| "text_classification", |
| "sentiment_analysis", |
| "named_entity_recognition", |
| "question_answering", |
| "summarization" |
| ], |
| label="Template", |
| value="text_classification" |
| ) |
| |
| export_btn = gr.Button("π€ Export Dataset", variant="primary") |
| export_status = gr.Markdown("") |
| export_file = gr.File(label="Download", visible=False) |
| |
| with gr.Column(scale=1): |
| gr.HTML(""" |
| <div class="workflow-card"> |
| <h3>π Export Info</h3> |
| <p><strong>JSON:</strong> Universal format</p> |
| <p><strong>CSV:</strong> Excel compatible</p> |
| <p><strong>JSONL:</strong> Line-separated</p> |
| <p><strong>HuggingFace:</strong> ML ready</p> |
| </div> |
| """) |
| |
| |
| def create_project(name, template): |
| if not name.strip(): |
| return "β Please enter a project name", {} |
| |
| project = studio.start_new_project(name.strip(), template) |
| status = f""" |
| ✅ **Project Created!** |
| |
| **Name:** {project['name']} |
| **Type:** {template.replace('_', ' ').title()} |
| **ID:** {project['id'][:8]}... |
| |
| π Next: Go to Data Collection tab |
| """ |
| return status, project |
| |
| def scrape_urls_handler(urls_text, project, progress=gr.Progress()): |
| if not project: |
| return "β Create a project first", "" |
| |
| urls = [url.strip() for url in urls_text.split('\n') if url.strip()] |
| if not urls: |
| return "β No URLs provided", "" |
| |
| def progress_callback(pct, msg): |
| progress(pct, desc=msg) |
| |
| success, errors = studio.scrape_urls(urls, progress_callback) |
| |
| if success > 0: |
| stats = f""" |
| <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;"> |
| <h3>✅ Scraping Complete</h3> |
| <p><strong>{success}</strong> items collected</p> |
| </div> |
| """ |
| |
| status = f""" |
| ✅ **Scraping Complete!** |
| |
| **Success:** {success} URLs |
| **Failed:** {len(urls) - success} URLs |
| |
| π Next: Go to Data Processing tab |
| """ |
| |
| return status, stats |
| else: |
| return f"β Scraping failed: {', '.join(errors)}", "" |
| |
| def process_data_handler(clean, quality, language, sentiment, entities, project): |
| if not project: |
| return "β Create a project first", "" |
| |
| if not studio.scraped_items: |
| return "β No data to process. Scrape URLs first.", "" |
| |
| options = { |
| 'clean_text': clean, |
| 'quality_filter': quality, |
| 'detect_language': language, |
| 'add_sentiment': sentiment, |
| 'extract_entities': entities |
| } |
| |
| processed = studio.process_data(options) |
| |
| if processed > 0: |
| stats = studio.get_data_statistics() |
| stats_html = f""" |
| <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;"> |
| <h3>⚙️ Processing Complete</h3> |
| <p><strong>{processed}</strong> items processed</p> |
| <p>Quality: <strong>{stats.get('avg_quality_score', 0)}</strong></p> |
| </div> |
| """ |
| |
| status = f""" |
| ✅ **Processing Complete!** |
| |
| **Processed:** {processed} items |
| **Avg Quality:** {stats.get('avg_quality_score', 0)} |
| |
| π Next: Check Data Preview tab |
| """ |
| |
| return status, stats_html |
| else: |
| return "β No items passed filters", "" |
| |
| def refresh_preview_handler(project): |
| if not project: |
| return None, {} |
| |
| preview = studio.get_data_preview() |
| stats = studio.get_data_statistics() |
| |
| if preview: |
| df_data = [] |
| for item in preview: |
| df_data.append([ |
| item['title'][:50] + "..." if len(item['title']) > 50 else item['title'], |
| item['content_preview'], |
| item['word_count'], |
| item['quality_score'], |
| item['url'][:50] + "..." if len(item['url']) > 50 else item['url'] |
| ]) |
| |
| return df_data, stats |
| |
| return None, {} |
| |
| def export_handler(format_type, template, project): |
| if not project: |
| return "β Create a project first", None |
| |
| if not studio.processed_items and not studio.scraped_items: |
| return "β No data to export", None |
| |
| try: |
| filename = studio.export_dataset(template, format_type) |
| |
| status = f""" |
| ✅ **Export Successful!** |
| |
| **Format:** {format_type} |
| **File:** {filename} |
| |
| π₯ Download link below |
| """ |
| |
| return status, filename |
| |
| except Exception as e: |
| return f"β Export failed: {str(e)}", None |
| |
| |
| create_project_btn.click( |
| fn=create_project, |
| inputs=[project_name, template_choice], |
| outputs=[project_status, project_state] |
| ) |
| |
| scrape_btn.click( |
| fn=scrape_urls_handler, |
| inputs=[urls_input, project_state], |
| outputs=[scraping_status, collection_stats] |
| ) |
| |
| process_btn.click( |
| fn=process_data_handler, |
| inputs=[clean_text, quality_filter, detect_language, |
| add_sentiment, extract_entities, project_state], |
| outputs=[processing_status, processing_stats] |
| ) |
| |
| refresh_btn.click( |
| fn=refresh_preview_handler, |
| inputs=[project_state], |
| outputs=[data_preview, dataset_stats] |
| ) |
| |
| export_btn.click( |
| fn=export_handler, |
| inputs=[export_format, export_template, project_state], |
| outputs=[export_status, export_file] |
| ) |
| |
| return interface |
|
|
| |
| if __name__ == "__main__": |
| logger.info("π Starting AI Dataset Studio...") |
| |
| |
| features = [] |
| if HAS_TRANSFORMERS: |
| features.append("β
AI Models") |
| else: |
| features.append("β οΈ Basic Processing") |
| |
| if HAS_NLTK: |
| features.append("β
Advanced NLP") |
| else: |
| features.append("β οΈ Basic NLP") |
| |
| if HAS_DATASETS: |
| features.append("β
HuggingFace Integration") |
| else: |
| features.append("β οΈ Standard Export") |
| |
| logger.info(f"π Features: {' | '.join(features)}") |
| |
| try: |
| |
| test_studio = DatasetStudio() |
| logger.info("β
DatasetStudio test passed") |
| |
| interface = create_modern_interface() |
| logger.info("β
Interface created successfully") |
| |
| interface.launch( |
| server_name="0.0.0.0", |
| server_port=7860, |
| share=False, |
| show_error=True |
| ) |
| |
| except Exception as e: |
| logger.error(f"β Failed to launch: {e}") |
| logger.error("π‘ Try: python app_minimal.py") |
| raise |