Spaces:
Sleeping
Sleeping
| """Gradio demo application for RusCxnPipe.""" | |
| import gradio as gr | |
| import logging | |
| from typing import List, Dict, Any | |
| # Set up logging to avoid cluttering the interface | |
| logging.getLogger("transformers").setLevel(logging.WARNING) | |
| logging.getLogger("sentence_transformers").setLevel(logging.WARNING) | |
| logging.getLogger("torch").setLevel(logging.WARNING) | |
| try: | |
| from ruscxnpipe import RusCxnPipe, SpanPredictor | |
| except ImportError: | |
| # For development/testing when library isn't installed | |
| import sys | |
| import os | |
| sys.path.append( | |
| os.path.dirname( | |
| os.path.dirname( | |
| os.path.abspath(__file__)))) | |
| from ruscxnpipe import RusCxnPipe, SpanPredictor | |
| # Initialize models at startup | |
| print("🚀 Initializing RusCxnPipe models...") | |
| try: | |
| PIPELINE = RusCxnPipe( | |
| semantic_model="Futyn-Maker/ruscxn-embedder", | |
| classification_model="Futyn-Maker/ruscxn-classifier", | |
| span_model="Futyn-Maker/ruscxn-span-predictor", | |
| cache_dir="./cache" | |
| ) | |
| SPAN_PREDICTOR = SpanPredictor( | |
| model_name="Futyn-Maker/ruscxn-span-predictor") | |
| print("✅ Models initialized successfully!") | |
| MODELS_LOADED = True | |
| MODEL_ERROR = None | |
| except Exception as e: | |
| print(f"❌ Error initializing models: {str(e)}") | |
| PIPELINE = None | |
| SPAN_PREDICTOR = None | |
| MODELS_LOADED = False | |
| MODEL_ERROR = str(e) | |
| def highlight_span( | |
| text: str, | |
| span_start: int, | |
| span_end: int, | |
| span_string: str) -> str: | |
| """Highlight a span in text using HTML.""" | |
| if span_start < 0 or span_end > len(text) or span_start >= span_end: | |
| return text | |
| # Ensure the span matches | |
| actual_span = text[span_start:span_end] | |
| if actual_span.strip() != span_string.strip(): | |
| # Fallback: try to find the span in the text | |
| span_start = text.find(span_string) | |
| if span_start >= 0: | |
| span_end = span_start + len(span_string) | |
| else: | |
| return text | |
| # Create highlighted version | |
| before = text[:span_start] | |
| highlighted = text[span_start:span_end] | |
| after = text[span_end:] | |
| return f'{before}<mark style="background-color: #64b5f6; color: #1565c0; padding: 2px 4px; border-radius: 3px; font-weight: bold;">{highlighted}</mark>{after}' | |
| def create_construction_link(construction_id: str, pattern: str) -> str: | |
| """Create a clickable link to the construction page.""" | |
| url = f"https://constructicon.ruscorpora.ru/construction/{construction_id}" | |
| return f'<a href="{url}" target="_blank" style="color: #1976d2; text-decoration: none; font-weight: bold; border-bottom: 1px dotted #1976d2;">{pattern}</a>' | |
| def format_pipeline_results(results: Dict[str, Any]) -> str: | |
| """Format the pipeline results as HTML.""" | |
| if not results or not results['constructions']: | |
| return "<div style='padding: 20px; text-align: center; color: #666;'>No constructions found in the text.</div>" | |
| constructions = results['constructions'] | |
| original_text = results['example'] | |
| html_parts = [] | |
| html_parts.append("<div style='font-family: Arial, sans-serif;'>") | |
| # Header | |
| html_parts.append( | |
| "<h3 style='color: #333; margin-bottom: 20px;'>Found {} construction(s):</h3>".format( | |
| len(constructions))) | |
| # Process each construction | |
| for i, construction in enumerate(constructions, 1): | |
| construction_id = construction['id'] | |
| pattern = construction['pattern'] | |
| span_info = construction['span'] | |
| # Construction header with link | |
| html_parts.append( | |
| "<div style='margin-bottom: 25px; padding: 15px; border: 1px solid #e0e0e0; border-radius: 8px; background-color: #fafafa;'>") | |
| html_parts.append( | |
| f"<h4 style='margin: 0 0 10px 0; color: #333;'>{i}. {create_construction_link(construction_id, pattern)}</h4>") | |
| # Highlighted text | |
| if span_info['span_string']: | |
| highlighted_text = highlight_span( | |
| original_text, | |
| span_info['span_start'], | |
| span_info['span_end'], | |
| span_info['span_string'] | |
| ) | |
| html_parts.append( | |
| f"<div style='font-size: 16px; line-height: 1.5; margin-top: 10px; padding: 10px; background-color: #f8f9fa; border-radius: 4px; border: 1px solid #ddd; color: #333;'>{highlighted_text}</div>") | |
| # Span details | |
| html_parts.append( | |
| "<div style='margin-top: 8px; font-size: 12px; color: #666;'>") | |
| html_parts.append( | |
| f"Span: \"{span_info['span_string']}\" (positions {span_info['span_start']}-{span_info['span_end']})") | |
| html_parts.append("</div>") | |
| else: | |
| html_parts.append( | |
| f"<div style='font-size: 16px; line-height: 1.5; margin-top: 10px; padding: 10px; background-color: #f8f9fa; border-radius: 4px; border: 1px solid #ddd; color: #333;'>{original_text}</div>") | |
| html_parts.append( | |
| "<div style='margin-top: 8px; font-size: 12px; color: #999;'>No specific span identified</div>") | |
| html_parts.append("</div>") | |
| html_parts.append("</div>") | |
| return "".join(html_parts) | |
| def format_span_results(text: str, results: List[Dict[str, Any]]) -> str: | |
| """Format span prediction results as HTML.""" | |
| if not results or not results[0]['patterns']: | |
| return "<div style='padding: 20px; text-align: center; color: #666;'>No patterns processed.</div>" | |
| patterns = results[0]['patterns'] | |
| html_parts = [] | |
| html_parts.append("<div style='font-family: Arial, sans-serif;'>") | |
| # Header | |
| html_parts.append( | |
| f"<h3 style='color: #333; margin-bottom: 20px;'>Span predictions for {len(patterns)} pattern(s):</h3>") | |
| # Process each pattern | |
| for i, pattern_info in enumerate(patterns, 1): | |
| pattern = pattern_info['pattern'] | |
| span_info = pattern_info['span'] | |
| html_parts.append( | |
| "<div style='margin-bottom: 25px; padding: 15px; border: 1px solid #e0e0e0; border-radius: 8px; background-color: #fafafa;'>") | |
| html_parts.append( | |
| f"<h4 style='margin: 0 0 10px 0; color: #333;'>{i}. {pattern}</h4>") | |
| # Highlighted text | |
| if span_info['span_string']: | |
| highlighted_text = highlight_span( | |
| text, | |
| span_info['span_start'], | |
| span_info['span_end'], | |
| span_info['span_string'] | |
| ) | |
| html_parts.append( | |
| f"<div style='font-size: 16px; line-height: 1.5; margin-top: 10px; padding: 10px; background-color: #f8f9fa; border-radius: 4px; border: 1px solid #ddd; color: #333;'>{highlighted_text}</div>") | |
| # Span details | |
| html_parts.append( | |
| "<div style='margin-top: 8px; font-size: 12px; color: #666;'>") | |
| html_parts.append( | |
| f"Span: \"{span_info['span_string']}\" (positions {span_info['span_start']}-{span_info['span_end']})") | |
| html_parts.append("</div>") | |
| else: | |
| html_parts.append( | |
| f"<div style='font-size: 16px; line-height: 1.5; margin-top: 10px; padding: 10px; background-color: #f8f9fa; border-radius: 4px; border: 1px solid #ddd; color: #333;'>{text}</div>") | |
| html_parts.append( | |
| "<div style='margin-top: 8px; font-size: 12px; color: #999;'>No span found for this pattern</div>") | |
| html_parts.append("</div>") | |
| html_parts.append("</div>") | |
| return "".join(html_parts) | |
| def process_full_pipeline(text: str, n_candidates: int) -> str: | |
| """Process text through the full pipeline.""" | |
| if not text.strip(): | |
| return "<div style='padding: 20px; text-align: center; color: #666;'>Please enter some text to analyze.</div>" | |
| if not MODELS_LOADED: | |
| return f"<div style='color: red; padding: 20px;'>Error: {MODEL_ERROR}</div>" | |
| try: | |
| results = PIPELINE.process_text( | |
| text.strip(), n_candidates=n_candidates) | |
| return format_pipeline_results(results) | |
| except Exception as e: | |
| return f"<div style='color: red; padding: 20px;'>Error processing text: {str(e)}</div>" | |
| def process_span_prediction(text: str, patterns_text: str) -> str: | |
| """Process text for span prediction only.""" | |
| if not text.strip(): | |
| return "<div style='padding: 20px; text-align: center; color: #666;'>Please enter some text to analyze.</div>" | |
| if not patterns_text.strip(): | |
| return "<div style='padding: 20px; text-align: center; color: #666;'>Please enter some patterns to search for.</div>" | |
| if not MODELS_LOADED: | |
| return f"<div style='color: red; padding: 20px;'>Error: {MODEL_ERROR}</div>" | |
| # Parse patterns | |
| patterns = [p.strip() | |
| for p in patterns_text.strip().split('\n') if p.strip()] | |
| if not patterns: | |
| return "<div style='padding: 20px; text-align: center; color: #666;'>No valid patterns found.</div>" | |
| # Prepare input for span predictor | |
| examples_with_patterns = [{'example': text.strip(), | |
| 'patterns': [{'id': f'pattern_{i}', | |
| 'pattern': pattern} for i, | |
| pattern in enumerate(patterns)]}] | |
| try: | |
| results = SPAN_PREDICTOR.predict_spans(examples_with_patterns) | |
| return format_span_results(text.strip(), results) | |
| except Exception as e: | |
| return f"<div style='color: red; padding: 20px;'>Error processing spans: {str(e)}</div>" | |
| # Create the Gradio interface | |
| def create_demo(): | |
| """Create the Gradio demo interface.""" | |
| # Custom CSS | |
| css = """ | |
| .gradio-container { | |
| font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; | |
| } | |
| .gr-button { | |
| background: linear-gradient(90deg, #1976d2, #42a5f5); | |
| border: none; | |
| color: white; | |
| font-weight: bold; | |
| } | |
| .gr-button:hover { | |
| background: linear-gradient(90deg, #1565c0, #2196f3); | |
| } | |
| """ | |
| with gr.Blocks(css=css, title="RusCxnPipe Demo", theme=gr.themes.Soft()) as demo: | |
| # Header | |
| gr.Markdown(""" | |
| # 🔍 RusCxnPipe: Russian Constructicon Pattern Extractor | |
| **Automatically identify and locate Russian constructicon patterns in text** | |
| This tool uses advanced NLP models to find linguistic constructions from the Russian Constructicon database in your text. | |
| It performs semantic search, classification, and span prediction to provide accurate results with precise text locations. | |
| """) | |
| with gr.Tabs(): | |
| # Tab 1: Full Pipeline | |
| with gr.Tab("🚀 Full Pipeline", id="pipeline"): | |
| gr.Markdown(""" | |
| ### Complete Analysis | |
| Enter Russian text to automatically find all constructicon patterns present in it. | |
| The system will search through the database, classify candidates, and highlight exact locations. | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| text_input = gr.Textbox( | |
| label="Text", | |
| placeholder="Мои друзья разъехались и исчезли кто где.", | |
| lines=3, | |
| value="Мои друзья разъехались и исчезли кто где.") | |
| n_candidates = gr.Slider( | |
| minimum=5, | |
| maximum=50, | |
| value=15, | |
| step=5, | |
| label="Number of semantic search candidates", | |
| info="More candidates = more thorough search but slower processing and higher probability of false-positives" | |
| ) | |
| analyze_btn = gr.Button( | |
| "🔍 Analyze Text", variant="primary", size="lg") | |
| with gr.Column(scale=3): | |
| results_html = gr.HTML( | |
| label="Results", | |
| value="<div style='padding: 40px; text-align: center; color: #666; border: 2px dashed #ccc; border-radius: 8px;'>Enter text and click 'Analyze Text' to see results</div>" | |
| ) | |
| # Examples | |
| gr.Markdown("### 📝 Try these examples:") | |
| example_texts = [ | |
| "Мои друзья разъехались и исчезли кто где.", | |
| "Петр так и замер на месте.", | |
| "Таня танцевала без устали, танцевала со всеми подряд." | |
| ] | |
| with gr.Row(): | |
| for example in example_texts: | |
| gr.Button(f'"{example}"', size="sm").click( | |
| lambda x=example: x, outputs=text_input | |
| ) | |
| analyze_btn.click( | |
| fn=process_full_pipeline, | |
| inputs=[text_input, n_candidates], | |
| outputs=results_html | |
| ) | |
| # Tab 2: Span Prediction Only | |
| with gr.Tab("🎯 Span Prediction", id="spans"): | |
| gr.Markdown(""" | |
| ### Pattern Span Detection | |
| Enter text and specific patterns to find where exactly these patterns occur in the text. | |
| This skips the search and classification steps, directly predicting span boundaries. | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| span_text_input = gr.Textbox( | |
| label="Text", | |
| placeholder="Мои друзья разъехались и исчезли кто где.", | |
| lines=3, | |
| value="Мои друзья разъехались и исчезли кто где.") | |
| patterns_input = gr.Textbox( | |
| label="Patterns (one per line)", | |
| placeholder="VP кто PronInt\nVP кто где", | |
| lines=5, | |
| value="VP кто PronInt\nVP кто где" | |
| ) | |
| predict_btn = gr.Button( | |
| "🎯 Predict Spans", variant="primary", size="lg") | |
| with gr.Column(scale=3): | |
| span_results_html = gr.HTML( | |
| label="Span Results", | |
| value="<div style='padding: 40px; text-align: center; color: #666; border: 2px dashed #ccc; border-radius: 8px;'>Enter text and patterns, then click 'Predict Spans' to see results</div>" | |
| ) | |
| predict_btn.click( | |
| fn=process_span_prediction, | |
| inputs=[span_text_input, patterns_input], | |
| outputs=span_results_html | |
| ) | |
| # Footer | |
| gr.Markdown(""" | |
| --- | |
| **About RusCxnPipe**: This tool is based on fine-tuned transformer models trained on Russian Constructicon data. | |
| The pipeline combines semantic search, classification, and span prediction to achieve high accuracy in construction detection. | |
| **Models used**: | |
| - Semantic: [ruscxn-embedder](https://huggingface.co/Futyn-Maker/ruscxn-embedder) | |
| - Classification: [ruscxn-classifier](https://huggingface.co/Futyn-Maker/ruscxn-classifier) | |
| - Span prediction: [ruscxn-span-predictor](https://huggingface.co/Futyn-Maker/ruscxn-span-predictor) | |
| 📚 [Russian Constructicon Database](https://constructicon.ruscorpora.ru/) | 💻 [Source Code](https://github.com/Futyn-Maker/ruscxnpipe) | |
| """) | |
| return demo | |
| if __name__ == "__main__": | |
| demo = create_demo() | |
| demo.launch( | |
| server_name="0.0.0.0", # For Hugging Face Spaces | |
| server_port=7860, # Default port for Spaces | |
| show_error=True | |
| ) | |