| """ |
| AI Dataset Studio - Complete Application |
| Fixed version with all classes properly defined |
| """ |
|
|
| import gradio as gr |
| import pandas as pd |
| import numpy as np |
| import json |
| import re |
| import requests |
| from bs4 import BeautifulSoup |
| from urllib.parse import urlparse, urljoin |
| from datetime import datetime, timedelta |
| import logging |
| from typing import Dict, List, Tuple, Optional, Any |
| from dataclasses import dataclass, asdict |
| from pathlib import Path |
| import uuid |
| import hashlib |
| import time |
| from collections import defaultdict |
| import io |
|
|
| |
| try: |
| from transformers import pipeline, AutoTokenizer, AutoModel |
| HAS_TRANSFORMERS = True |
| except ImportError: |
| HAS_TRANSFORMERS = False |
|
|
| try: |
| import nltk |
| from nltk.tokenize import sent_tokenize, word_tokenize |
| HAS_NLTK = True |
| except ImportError: |
| HAS_NLTK = False |
|
|
| try: |
| from datasets import Dataset, DatasetDict |
| HAS_DATASETS = True |
| except ImportError: |
| HAS_DATASETS = False |
|
|
| |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
| logger = logging.getLogger(__name__) |
|
|
| |
| if HAS_NLTK: |
| try: |
| nltk.download('punkt', quiet=True) |
| nltk.download('stopwords', quiet=True) |
| nltk.download('averaged_perceptron_tagger', quiet=True) |
| except Exception: |
| pass  # downloads are best-effort; NLTK features degrade gracefully if they fail |
|
|
| @dataclass |
| class ScrapedItem: |
| """Data class for scraped content""" |
| id: str |
| url: str |
| title: str |
| content: str |
| metadata: Dict[str, Any] |
| scraped_at: str |
| word_count: int |
| language: str = "en" |
| quality_score: float = 0.0 |
| labels: Optional[List[str]] = None |
| annotations: Optional[Dict[str, Any]] = None |
|
|
| def __post_init__(self): |
| if self.labels is None: |
| self.labels = [] |
| if self.annotations is None: |
| self.annotations = {} |
|
|
| @dataclass |
| class DatasetTemplate: |
| """Template for dataset creation""" |
| name: str |
| description: str |
| task_type: str |
| required_fields: List[str] |
| optional_fields: List[str] |
| example_format: Dict[str, Any] |
| instructions: str |
|
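| # Serialization sketch (illustrative, not executed): both dataclasses flatten to |
| # plain dicts via dataclasses.asdict (imported above), which is convenient for |
| # json.dump and logging. |
| # |
| #   item = ScrapedItem(id="1", url="https://example.com", title="Demo", content="Hello world", |
| #                      metadata={}, scraped_at=datetime.now().isoformat(), word_count=2) |
| #   record = asdict(item)  # {'id': '1', 'url': 'https://example.com', ...} |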
|
| class SecurityValidator: |
| """Security validation for URLs and content""" |
| |
| ALLOWED_SCHEMES = {'http', 'https'} |
| BLOCKED_DOMAINS = { |
| 'localhost', '127.0.0.1', '0.0.0.0', |
| '192.168.', '10.', '172.16.', '172.17.', |
| '172.18.', '172.19.', '172.20.', '172.21.', |
| '172.22.', '172.23.', '172.24.', '172.25.', |
| '172.26.', '172.27.', '172.28.', '172.29.', |
| '172.30.', '172.31.' |
| } |
| |
| @classmethod |
| def validate_url(cls, url: str) -> Tuple[bool, str]: |
| """Validate URL for security concerns""" |
| try: |
| parsed = urlparse(url) |
| |
| if parsed.scheme not in cls.ALLOWED_SCHEMES: |
| return False, f"Invalid scheme: {parsed.scheme}" |
| |
| hostname = parsed.hostname or '' |
| # prefix/equality check instead of substring match, so e.g. "web10.example.com" is not caught by "10." |
| if any(hostname == blocked or hostname.startswith(blocked) for blocked in cls.BLOCKED_DOMAINS): |
| return False, "Access to internal networks not allowed" |
| |
| if not parsed.netloc: |
| return False, "Invalid URL format" |
| |
| return True, "URL is valid" |
| |
| except Exception as e: |
| return False, f"URL validation error: {str(e)}" |
|
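| # Usage sketch (illustrative, not executed): |
| # |
| #   ok, msg = SecurityValidator.validate_url("https://example.com/article")  # True, "URL is valid" |
| #   ok, msg = SecurityValidator.validate_url("http://127.0.0.1:8080/admin")  # False, internal host blocked |
| #   ok, msg = SecurityValidator.validate_url("ftp://example.com/file")       # False, invalid scheme |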
|
| class WebScraperEngine: |
| """Advanced web scraping engine""" |
| |
| def __init__(self): |
| self.session = requests.Session() |
| self.session.headers.update({ |
| 'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0)', |
| 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', |
| 'Accept-Language': 'en-US,en;q=0.5', |
| 'Connection': 'keep-alive', |
| }) |
| |
| def scrape_url(self, url: str) -> Optional[ScrapedItem]: |
| """Scrape a single URL""" |
| try: |
| |
| is_valid, validation_msg = SecurityValidator.validate_url(url) |
| if not is_valid: |
| raise ValueError(f"Security validation failed: {validation_msg}") |
| |
| |
| response = self.session.get(url, timeout=15) |
| response.raise_for_status() |
| |
| |
| soup = BeautifulSoup(response.content, 'html.parser') |
| |
| |
| title = self._extract_title(soup) |
| content = self._extract_content(soup) |
| metadata = self._extract_metadata(soup, response) |
| |
| |
| item = ScrapedItem( |
| id=str(uuid.uuid4()), |
| url=url, |
| title=title, |
| content=content, |
| metadata=metadata, |
| scraped_at=datetime.now().isoformat(), |
| word_count=len(content.split()), |
| quality_score=self._assess_quality(content) |
| ) |
| |
| return item |
| |
| except Exception as e: |
| logger.error(f"Failed to scrape {url}: {e}") |
| return None |
| |
| def batch_scrape(self, urls: List[str], progress_callback=None) -> List[ScrapedItem]: |
| """Scrape multiple URLs""" |
| results = [] |
| total = len(urls) |
| |
| for i, url in enumerate(urls): |
| if progress_callback: |
| progress_callback(i / total, f"Scraping {i+1}/{total}: {url[:50]}...") |
| |
| item = self.scrape_url(url) |
| if item: |
| results.append(item) |
| |
| time.sleep(1)  # fixed 1s delay between requests to be polite to target servers |
| |
| return results |
| |
| def _extract_title(self, soup: BeautifulSoup) -> str: |
| """Extract page title""" |
| title_tag = soup.find('title') |
| if title_tag: |
| return title_tag.get_text().strip() |
| |
| h1_tag = soup.find('h1') |
| if h1_tag: |
| return h1_tag.get_text().strip() |
| |
| return "Untitled" |
| |
| def _extract_content(self, soup: BeautifulSoup) -> str: |
| """Extract main content""" |
| |
| for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']): |
| element.decompose() |
| |
| |
| content_selectors = [ |
| 'article', 'main', '.content', '.post-content', |
| '.entry-content', '.article-body' |
| ] |
| |
| for selector in content_selectors: |
| element = soup.select_one(selector) |
| if element: |
| text = element.get_text(separator=' ', strip=True) |
| if len(text) > 200: |
| return self._clean_text(text) |
| |
| |
| body = soup.find('body') |
| if body: |
| return self._clean_text(body.get_text(separator=' ', strip=True)) |
| |
| return self._clean_text(soup.get_text(separator=' ', strip=True)) |
| |
| def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]: |
| """Extract metadata""" |
| metadata = { |
| 'domain': urlparse(response.url).netloc, |
| 'status_code': response.status_code, |
| 'extracted_at': datetime.now().isoformat() |
| } |
| |
| |
| for tag in ['description', 'keywords', 'author']: |
| element = soup.find('meta', attrs={'name': tag}) |
| if element: |
| metadata[tag] = element.get('content', '') |
| |
| return metadata |
| |
| def _clean_text(self, text: str) -> str: |
| """Clean extracted text""" |
| text = re.sub(r'\s+', ' ', text) |
| text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE) |
| text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE) |
| return text.strip() |
| |
| def _assess_quality(self, content: str) -> float: |
| """Assess content quality""" |
| if not content: |
| return 0.0 |
| |
| score = 0.0 |
| word_count = len(content.split()) |
| |
| if word_count >= 50: |
| score += 0.4 |
| elif word_count >= 20: |
| score += 0.2 |
| |
| sentence_count = len(re.split(r'[.!?]+', content)) |
| if sentence_count >= 3: |
| score += 0.3 |
| |
| if re.search(r'[A-Z][a-z]+', content): |
| score += 0.3 |
| |
| return min(score, 1.0) |
|
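| # Usage sketch (illustrative; performs a real HTTP request, so the result depends |
| # on network access and the target page): |
| # |
| #   scraper = WebScraperEngine() |
| #   item = scraper.scrape_url("https://example.com/article")  # returns None on any failure |
| #   if item: |
| #       print(item.title, item.word_count, round(item.quality_score, 2)) |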
|
| class DataProcessor: |
| """Data processing pipeline""" |
| |
| def __init__(self): |
| self.sentiment_analyzer = None |
| self.ner_model = None |
| self._load_models() |
| |
| def _load_models(self): |
| """Load NLP models""" |
| if not HAS_TRANSFORMERS: |
| logger.warning("β οΈ Transformers not available") |
| return |
| |
| try: |
| self.sentiment_analyzer = pipeline( |
| "sentiment-analysis", |
| model="cardiffnlp/twitter-roberta-base-sentiment-latest" |
| ) |
| logger.info("β
Sentiment model loaded") |
| except Exception as e: |
| logger.warning(f"β οΈ Could not load sentiment model: {e}") |
| |
| def process_items(self, items: List[ScrapedItem], options: Dict[str, bool]) -> List[ScrapedItem]: |
| """Process scraped items""" |
| processed = [] |
| |
| for item in items: |
| try: |
| |
| if options.get('clean_text', True): |
| item.content = self._clean_text_advanced(item.content) |
| |
| |
| if options.get('quality_filter', True) and item.quality_score < 0.3: |
| continue |
| |
| |
| if options.get('add_sentiment', False) and self.sentiment_analyzer: |
| sentiment = self._analyze_sentiment(item.content) |
| item.metadata['sentiment'] = sentiment |
| |
| |
| if options.get('detect_language', True): |
| item.language = self._detect_language(item.content) |
| |
| processed.append(item) |
| |
| except Exception as e: |
| logger.error(f"Error processing item {item.id}: {e}") |
| continue |
| |
| return processed |
| |
| def _clean_text_advanced(self, text: str) -> str: |
| """Advanced text cleaning""" |
| text = re.sub(r'http\S+|www\.\S+', '', text) |
| text = re.sub(r'\S+@\S+', '', text) |
| text = re.sub(r'\s+', ' ', text) |
| return text.strip() |
| |
| def _analyze_sentiment(self, text: str) -> Dict[str, Any]: |
| """Analyze sentiment""" |
| try: |
| text_sample = text[:512]  # rough character-level truncation to stay within the model's input window |
| result = self.sentiment_analyzer(text_sample)[0] |
| return { |
| 'label': result['label'], |
| 'score': result['score'] |
| } |
| except Exception: |
| return {'label': 'UNKNOWN', 'score': 0.0} |
| |
| def _detect_language(self, text: str) -> str: |
| """Simple language detection""" |
| if re.search(r'[а-яё]', text.lower()): |
| return 'ru' |
| elif re.search(r'[ñáéíóúü]', text.lower()): |
| return 'es' |
| return 'en' |
|
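| # Processing sketch (illustrative): the options dict mirrors the checkboxes in the |
| # UI below; 'add_sentiment' only has an effect when the transformers pipeline loaded. |
| # |
| #   processor = DataProcessor() |
| #   kept = processor.process_items(items, { |
| #       'clean_text': True, 'quality_filter': True, |
| #       'detect_language': True, 'add_sentiment': False, |
| #   })  # items scoring below 0.3 are dropped when quality_filter is on |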
|
| class AnnotationEngine: |
| """Annotation tools for dataset creation""" |
| |
| def __init__(self): |
| self.templates = self._load_templates() |
| |
| def _load_templates(self) -> Dict[str, DatasetTemplate]: |
| """Load dataset templates""" |
| templates = { |
| 'text_classification': DatasetTemplate( |
| name="Text Classification", |
| description="Classify text into categories", |
| task_type="classification", |
| required_fields=["text", "label"], |
| optional_fields=["confidence", "metadata"], |
| example_format={"text": "Sample text", "label": "positive"}, |
| instructions="Label each text with appropriate category" |
| ), |
| 'sentiment_analysis': DatasetTemplate( |
| name="Sentiment Analysis", |
| description="Analyze emotional tone", |
| task_type="classification", |
| required_fields=["text", "sentiment"], |
| optional_fields=["confidence", "aspects"], |
| example_format={"text": "I love this!", "sentiment": "positive"}, |
| instructions="Classify sentiment as positive, negative, or neutral" |
| ), |
| 'named_entity_recognition': DatasetTemplate( |
| name="Named Entity Recognition", |
| description="Identify named entities", |
| task_type="ner", |
| required_fields=["text", "entities"], |
| optional_fields=["metadata"], |
| example_format={ |
| "text": "John works at OpenAI", |
| "entities": [{"text": "John", "label": "PERSON"}] |
| }, |
| instructions="Mark all named entities" |
| ), |
| 'question_answering': DatasetTemplate( |
| name="Question Answering", |
| description="Create Q&A pairs", |
| task_type="qa", |
| required_fields=["context", "question", "answer"], |
| optional_fields=["answer_start", "metadata"], |
| example_format={ |
| "context": "The capital of France is Paris.", |
| "question": "What is the capital of France?", |
| "answer": "Paris" |
| }, |
| instructions="Create meaningful questions and answers" |
| ), |
| 'summarization': DatasetTemplate( |
| name="Text Summarization", |
| description="Create summaries", |
| task_type="summarization", |
| required_fields=["text", "summary"], |
| optional_fields=["summary_type", "length"], |
| example_format={ |
| "text": "Long article text...", |
| "summary": "Brief summary" |
| }, |
| instructions="Write clear, concise summaries" |
| ) |
| } |
| return templates |
|
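| # Template lookup sketch (illustrative): templates are keyed by task id, and each |
| # one declares the fields an exported record must carry. |
| # |
| #   engine = AnnotationEngine() |
| #   qa = engine.templates['question_answering'] |
| #   print(qa.required_fields)  # ['context', 'question', 'answer'] |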
|
| class DatasetExporter: |
| """Export datasets in various formats""" |
| |
| def __init__(self): |
| self.supported_formats = [ |
| 'json', 'csv', 'jsonl', 'huggingface_datasets' |
| ] |
| |
| def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate, |
| export_format: str, annotations: Dict[str, Any] = None) -> str: |
| """Export dataset""" |
| try: |
| dataset_data = self._prepare_data(items, template, annotations) |
| |
| if export_format == 'json': |
| return self._export_json(dataset_data) |
| elif export_format == 'csv': |
| return self._export_csv(dataset_data) |
| elif export_format == 'jsonl': |
| return self._export_jsonl(dataset_data) |
| elif export_format == 'huggingface_datasets': |
| return self._export_huggingface(dataset_data, template) |
| else: |
| raise ValueError(f"Unsupported format: {export_format}") |
| |
| except Exception as e: |
| logger.error(f"Export failed: {e}") |
| raise |
| |
| def _prepare_data(self, items: List[ScrapedItem], template: DatasetTemplate, |
| annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]: |
| """Prepare data according to template""" |
| dataset_data = [] |
| |
| for item in items: |
| data_point = { |
| 'text': item.content, |
| 'title': item.title, |
| 'url': item.url, |
| 'metadata': item.metadata |
| } |
| |
| if annotations and item.id in annotations: |
| data_point.update(annotations[item.id]) |
| |
| formatted = self._format_for_template(data_point, template) |
| if formatted: |
| dataset_data.append(formatted) |
| |
| return dataset_data |
| |
| def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Dict[str, Any]: |
| """Format data according to template""" |
| formatted = {} |
| |
| for field in template.required_fields: |
| if field in data_point: |
| formatted[field] = data_point[field] |
| elif field == 'text' and 'content' in data_point: |
| formatted[field] = data_point['content'] |
| else: |
| return None |
| |
| for field in template.optional_fields: |
| if field in data_point: |
| formatted[field] = data_point[field] |
| |
| return formatted |
| |
| def _export_json(self, data: List[Dict[str, Any]]) -> str: |
| """Export as JSON""" |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| filename = f"dataset_{timestamp}.json" |
| |
| with open(filename, 'w', encoding='utf-8') as f: |
| json.dump(data, f, indent=2, ensure_ascii=False) |
| |
| return filename |
| |
| def _export_csv(self, data: List[Dict[str, Any]]) -> str: |
| """Export as CSV""" |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| filename = f"dataset_{timestamp}.csv" |
| |
| df = pd.DataFrame(data) |
| df.to_csv(filename, index=False) |
| |
| return filename |
| |
| def _export_jsonl(self, data: List[Dict[str, Any]]) -> str: |
| """Export as JSONL""" |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| filename = f"dataset_{timestamp}.jsonl" |
| |
| with open(filename, 'w', encoding='utf-8') as f: |
| for item in data: |
| f.write(json.dumps(item, ensure_ascii=False) + '\n') |
| |
| return filename |
| |
| def _export_huggingface(self, data: List[Dict[str, Any]], template: DatasetTemplate) -> str: |
| """Export as HuggingFace Dataset""" |
| if not HAS_DATASETS: |
| raise ImportError("datasets library not available") |
| |
| dataset = Dataset.from_list(data) |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}" |
| |
| dataset.save_to_disk(dataset_name) |
| return dataset_name |
|
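| # Export sketch (illustrative): records missing a template's required fields are |
| # silently dropped by _format_for_template, so e.g. a classification export needs a |
| # 'label' supplied through the annotations mapping (keyed by item id). |
| # |
| #   exporter = DatasetExporter() |
| #   path = exporter.export_dataset(items, template, 'jsonl', |
| #                                  annotations={items[0].id: {'label': 'news'}}) |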
|
| class DatasetStudio: |
| """Main application orchestrator""" |
| |
| def __init__(self): |
| self.scraper = WebScraperEngine() |
| self.processor = DataProcessor() |
| self.annotator = AnnotationEngine() |
| self.exporter = DatasetExporter() |
| |
| |
| self.scraped_items = [] |
| self.processed_items = [] |
| self.current_project = None |
| self.annotation_state = {} |
| |
| logger.info("β
DatasetStudio initialized successfully") |
| |
| def start_new_project(self, project_name: str, template_type: str) -> Dict[str, Any]: |
| """Start new project""" |
| self.current_project = { |
| 'name': project_name, |
| 'template': template_type, |
| 'created_at': datetime.now().isoformat(), |
| 'id': str(uuid.uuid4()) |
| } |
| |
| self.scraped_items = [] |
| self.processed_items = [] |
| self.annotation_state = {} |
| |
| logger.info(f"π New project: {project_name}") |
| return self.current_project |
| |
| def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]: |
| """Scrape URLs""" |
| url_list = [url.strip() for url in urls if url.strip()] |
| |
| if not url_list: |
| return 0, ["No valid URLs provided"] |
| |
| logger.info(f"π·οΈ Scraping {len(url_list)} URLs") |
| self.scraped_items = self.scraper.batch_scrape(url_list, progress_callback) |
| |
| success = len(self.scraped_items) |
| failed = len(url_list) - success |
| |
| errors = [] |
| if failed > 0: |
| errors.append(f"{failed} URLs failed") |
| |
| logger.info(f"β
Scraped {success}, failed {failed}") |
| return success, errors |
| |
| def process_data(self, options: Dict[str, bool]) -> int: |
| """Process scraped data""" |
| if not self.scraped_items: |
| return 0 |
| |
| logger.info(f"βοΈ Processing {len(self.scraped_items)} items") |
| self.processed_items = self.processor.process_items(self.scraped_items, options) |
| |
| logger.info(f"β
Processed {len(self.processed_items)} items") |
| return len(self.processed_items) |
| |
| def get_data_preview(self, num_items: int = 5) -> List[Dict[str, Any]]: |
| """Get data preview""" |
| items = self.processed_items or self.scraped_items |
| |
| preview = [] |
| for item in items[:num_items]: |
| preview.append({ |
| 'title': item.title, |
| 'content_preview': item.content[:200] + "..." if len(item.content) > 200 else item.content, |
| 'word_count': item.word_count, |
| 'quality_score': round(item.quality_score, 2), |
| 'url': item.url |
| }) |
| |
| return preview |
| |
| def get_data_statistics(self) -> Dict[str, Any]: |
| """Get dataset statistics""" |
| items = self.processed_items or self.scraped_items |
| |
| if not items: |
| return {} |
| |
| word_counts = [item.word_count for item in items] |
| quality_scores = [item.quality_score for item in items] |
| |
| return { |
| 'total_items': len(items), |
| 'avg_word_count': round(np.mean(word_counts)), |
| 'avg_quality_score': round(np.mean(quality_scores), 2), |
| 'word_count_range': [min(word_counts), max(word_counts)], |
| 'quality_range': [round(min(quality_scores), 2), round(max(quality_scores), 2)], |
| 'languages': list(set(item.language for item in items)), |
| 'domains': list(set(urlparse(item.url).netloc for item in items)) |
| } |
| |
| def export_dataset(self, template_name: str, export_format: str, annotations: Dict[str, Any] = None) -> str: |
| """Export dataset""" |
| if not self.processed_items and not self.scraped_items: |
| raise ValueError("No data to export") |
| |
| items = self.processed_items or self.scraped_items |
| template = self.annotator.templates.get(template_name) |
| |
| if not template: |
| raise ValueError(f"Unknown template: {template_name}") |
| |
| logger.info(f"π€ Exporting {len(items)} items") |
| return self.exporter.export_dataset(items, template, export_format, annotations) |
|
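| # End-to-end sketch (illustrative; hits the network): the same pipeline the Gradio |
| # UI drives, usable programmatically. |
| # |
| #   studio = DatasetStudio() |
| #   studio.start_new_project("demo", "text_classification") |
| #   studio.scrape_urls(["https://example.com/article"]) |
| #   studio.process_data({'clean_text': True, 'quality_filter': True}) |
| #   print(studio.get_data_statistics()) |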
|
| def create_modern_interface(): |
| """Create the modern Gradio interface""" |
| |
| |
| studio = DatasetStudio() |
| |
| |
| css = """ |
| .gradio-container { max-width: 1400px; margin: auto; } |
| .studio-header { |
| background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
| color: white; padding: 2rem; border-radius: 15px; |
| margin-bottom: 2rem; text-align: center; |
| } |
| .workflow-card { |
| background: #f8f9ff; border: 2px solid #e1e5ff; |
| border-radius: 12px; padding: 1.5rem; margin: 1rem 0; |
| } |
| .step-header { |
| font-size: 1.2em; font-weight: 600; color: #4c51bf; |
| margin-bottom: 1rem; |
| } |
| """ |
| |
|
| with gr.Blocks(css=css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface: |
| # gr.State must be created inside the Blocks context so the event handlers below can read and write it |
| project_state = gr.State({}) |
| |
| |
| gr.HTML(""" |
| <div class="studio-header"> |
| <h1>π AI Dataset Studio</h1> |
| <p>Create high-quality training datasets without coding</p> |
| </div> |
| """) |
| |
| with gr.Tabs() as main_tabs: |
| |
| |
| with gr.Tab("π― Project Setup"): |
| gr.HTML('<div class="step-header">Step 1: Create Your Project</div>') |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| project_name = gr.Textbox( |
| label="Project Name", |
| placeholder="My Dataset Project", |
| value="News Analysis Dataset" |
| ) |
| |
| template_choice = gr.Radio( |
| choices=[ |
| ("π Text Classification", "text_classification"), |
| ("π Sentiment Analysis", "sentiment_analysis"), |
| ("π₯ Named Entity Recognition", "named_entity_recognition"), |
| ("β Question Answering", "question_answering"), |
| ("π Text Summarization", "summarization") |
| ], |
| label="Dataset Type", |
| value="text_classification" |
| ) |
| |
| create_project_btn = gr.Button("π Create Project", variant="primary") |
| project_status = gr.Markdown("") |
| |
| with gr.Column(scale=1): |
| gr.HTML(""" |
| <div class="workflow-card"> |
| <h3>π‘ Template Guide</h3> |
| <p><strong>Text Classification:</strong> Categorize content</p> |
| <p><strong>Sentiment Analysis:</strong> Analyze emotions</p> |
| <p><strong>Named Entity Recognition:</strong> Identify entities</p> |
| <p><strong>Question Answering:</strong> Create Q&A pairs</p> |
| <p><strong>Summarization:</strong> Generate summaries</p> |
| </div> |
| """) |
| |
| |
| with gr.Tab("π·οΈ Data Collection"): |
| gr.HTML('<div class="step-header">Step 2: Collect Your Data</div>') |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| urls_input = gr.Textbox( |
| label="URLs to Scrape (one per line)", |
| placeholder="https://example.com/article1\nhttps://example.com/article2", |
| lines=8 |
| ) |
| |
| scrape_btn = gr.Button("π Start Scraping", variant="primary") |
| scraping_status = gr.Markdown("") |
| |
| with gr.Column(scale=1): |
| collection_stats = gr.HTML("") |
| |
| |
| with gr.Tab("βοΈ Data Processing"): |
| gr.HTML('<div class="step-header">Step 3: Clean & Enhance</div>') |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| with gr.Row(): |
| with gr.Column(): |
| clean_text = gr.Checkbox(label="π§Ή Text Cleaning", value=True) |
| quality_filter = gr.Checkbox(label="π― Quality Filter", value=True) |
| detect_language = gr.Checkbox(label="π Language Detection", value=True) |
| |
| with gr.Column(): |
| add_sentiment = gr.Checkbox(label="π Sentiment Analysis", value=False) |
| extract_entities = gr.Checkbox(label="π₯ Entity Extraction", value=False) |
| |
| process_btn = gr.Button("βοΈ Process Data", variant="primary") |
| processing_status = gr.Markdown("") |
| |
| with gr.Column(scale=1): |
| processing_stats = gr.HTML("") |
| |
| |
| with gr.Tab("π Data Preview"): |
| gr.HTML('<div class="step-header">Step 4: Review Dataset</div>') |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| refresh_btn = gr.Button("π Refresh Preview", variant="secondary") |
| |
| data_preview = gr.DataFrame( |
| headers=["Title", "Content Preview", "Words", "Quality", "URL"], |
| label="Dataset Preview" |
| ) |
| |
| with gr.Column(scale=1): |
| dataset_stats = gr.JSON(label="Statistics") |
| |
| |
| with gr.Tab("π€ Export Dataset"): |
| gr.HTML('<div class="step-header">Step 5: Export Your Dataset</div>') |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| export_format = gr.Radio( |
| choices=[ |
| ("π JSON", "json"), |
| ("π CSV", "csv"), |
| ("π JSONL", "jsonl"), |
| ("π€ HuggingFace", "huggingface_datasets") |
| ], |
| label="Export Format", |
| value="json" |
| ) |
| |
| export_template = gr.Dropdown( |
| choices=[ |
| "text_classification", |
| "sentiment_analysis", |
| "named_entity_recognition", |
| "question_answering", |
| "summarization" |
| ], |
| label="Template", |
| value="text_classification" |
| ) |
| |
| export_btn = gr.Button("π€ Export Dataset", variant="primary") |
| export_status = gr.Markdown("") |
| export_file = gr.File(label="Download", visible=False) |
| |
| with gr.Column(scale=1): |
| gr.HTML(""" |
| <div class="workflow-card"> |
| <h3>π Export Info</h3> |
| <p><strong>JSON:</strong> Universal format</p> |
| <p><strong>CSV:</strong> Excel compatible</p> |
| <p><strong>JSONL:</strong> Line-separated</p> |
| <p><strong>HuggingFace:</strong> ML ready</p> |
| </div> |
| """) |
| |
| |
| def create_project(name, template): |
| if not name.strip(): |
| return "β Please enter a project name", {} |
| |
| project = studio.start_new_project(name.strip(), template) |
| status = f""" |
| ✅ **Project Created!** |
| |
| **Name:** {project['name']} |
| **Type:** {template.replace('_', ' ').title()} |
| **ID:** {project['id'][:8]}... |
| |
| π Next: Go to Data Collection tab |
| """ |
| return status, project |
| |
| def scrape_urls_handler(urls_text, project, progress=gr.Progress()): |
| if not project: |
| return "β Create a project first", "" |
| |
| urls = [url.strip() for url in urls_text.split('\n') if url.strip()] |
| if not urls: |
| return "β No URLs provided", "" |
| |
| def progress_callback(pct, msg): |
| progress(pct, desc=msg) |
| |
| success, errors = studio.scrape_urls(urls, progress_callback) |
| |
| if success > 0: |
| stats = f""" |
| <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;"> |
| <h3>✅ Scraping Complete</h3> |
| <p><strong>{success}</strong> items collected</p> |
| </div> |
| """ |
| |
| status = f""" |
| ✅ **Scraping Complete!** |
| |
| **Success:** {success} URLs |
| **Failed:** {len(urls) - success} URLs |
| |
| π Next: Go to Data Processing tab |
| """ |
| |
| return status, stats |
| else: |
| return f"β Scraping failed: {', '.join(errors)}", "" |
| |
| def process_data_handler(clean, quality, language, sentiment, entities, project): |
| if not project: |
| return "β Create a project first", "" |
| |
| if not studio.scraped_items: |
| return "β No data to process. Scrape URLs first.", "" |
| |
| options = { |
| 'clean_text': clean, |
| 'quality_filter': quality, |
| 'detect_language': language, |
| 'add_sentiment': sentiment, |
| 'extract_entities': entities |
| } |
| |
| processed = studio.process_data(options) |
| |
| if processed > 0: |
| stats = studio.get_data_statistics() |
| stats_html = f""" |
| <div style="background: #e8f5e8; padding: 1rem; border-radius: 8px;"> |
| <h3>⚙️ Processing Complete</h3> |
| <p><strong>{processed}</strong> items processed</p> |
| <p>Quality: <strong>{stats.get('avg_quality_score', 0)}</strong></p> |
| </div> |
| """ |
| |
| status = f""" |
| ✅ **Processing Complete!** |
| |
| **Processed:** {processed} items |
| **Avg Quality:** {stats.get('avg_quality_score', 0)} |
| |
| π Next: Check Data Preview tab |
| """ |
| |
| return status, stats_html |
| else: |
| return "β No items passed filters", "" |
| |
| def refresh_preview_handler(project): |
| if not project: |
| return None, {} |
| |
| preview = studio.get_data_preview() |
| stats = studio.get_data_statistics() |
| |
| if preview: |
| df_data = [] |
| for item in preview: |
| df_data.append([ |
| item['title'][:50] + "..." if len(item['title']) > 50 else item['title'], |
| item['content_preview'], |
| item['word_count'], |
| item['quality_score'], |
| item['url'][:50] + "..." if len(item['url']) > 50 else item['url'] |
| ]) |
| |
| return df_data, stats |
| |
| return None, {} |
| |
| def export_handler(format_type, template, project): |
| if not project: |
| return "β Create a project first", None |
| |
| if not studio.processed_items and not studio.scraped_items: |
| return "β No data to export", None |
| |
| try: |
| filename = studio.export_dataset(template, format_type) |
| |
| status = f""" |
| ✅ **Export Successful!** |
| |
| **Format:** {format_type} |
| **File:** {filename} |
| |
| π₯ Download link below |
| """ |
| |
| return status, filename |
| |
| except Exception as e: |
| return f"β Export failed: {str(e)}", None |
| |
| |
| create_project_btn.click( |
| fn=create_project, |
| inputs=[project_name, template_choice], |
| outputs=[project_status, project_state] |
| ) |
| |
| scrape_btn.click( |
| fn=scrape_urls_handler, |
| inputs=[urls_input, project_state], |
| outputs=[scraping_status, collection_stats] |
| ) |
| |
| process_btn.click( |
| fn=process_data_handler, |
| inputs=[clean_text, quality_filter, detect_language, |
| add_sentiment, extract_entities, project_state], |
| outputs=[processing_status, processing_stats] |
| ) |
| |
| refresh_btn.click( |
| fn=refresh_preview_handler, |
| inputs=[project_state], |
| outputs=[data_preview, dataset_stats] |
| ) |
| |
| export_btn.click( |
| fn=export_handler, |
| inputs=[export_format, export_template, project_state], |
| outputs=[export_status, export_file] |
| ) |
| |
| return interface |
|
|
| |
| if __name__ == "__main__": |
| logger.info("π Starting AI Dataset Studio...") |
| |
| |
| features = [] |
| if HAS_TRANSFORMERS: |
| features.append("β
AI Models") |
| else: |
| features.append("β οΈ Basic Processing") |
| |
| if HAS_NLTK: |
| features.append("β
Advanced NLP") |
| else: |
| features.append("β οΈ Basic NLP") |
| |
| if HAS_DATASETS: |
| features.append("β
HuggingFace Integration") |
| else: |
| features.append("β οΈ Standard Export") |
| |
| logger.info(f"π Features: {' | '.join(features)}") |
| |
| try: |
| |
| test_studio = DatasetStudio() |
| logger.info("β
DatasetStudio test passed") |
| |
| interface = create_modern_interface() |
| logger.info("β
Interface created successfully") |
| |
| interface.launch( |
| server_name="0.0.0.0", |
| server_port=7860, |
| share=False, |
| show_error=True |
| ) |
| |
| except Exception as e: |
| logger.error(f"β Failed to launch: {e}") |
| logger.error("π‘ Try: python app_minimal.py") |
| raise |