Futyn-Maker committed on
Commit
a25103f
·
1 Parent(s): 576952f

Initial app

Browse files
Files changed (2) hide show
  1. app.py +398 -0
  2. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio demo application for RusCxnPipe."""
2
+
3
+ import gradio as gr
4
+ import logging
5
+ from typing import List, Dict, Any
6
+
7
+
8
# Set up logging to avoid cluttering the interface: only WARNING and above
# from these third-party loggers is let through.
logging.getLogger("transformers").setLevel(logging.WARNING)
logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
logging.getLogger("torch").setLevel(logging.WARNING)

try:
    from ruscxnpipe import RusCxnPipe, SpanPredictor
except ImportError:
    # For development/testing when library isn't installed: append the parent
    # of this file's directory to sys.path and retry the import. If the retry
    # fails as well, the ImportError propagates.
    import sys
    import os
    sys.path.append(
        os.path.dirname(
            os.path.dirname(
                os.path.abspath(__file__))))
    from ruscxnpipe import RusCxnPipe, SpanPredictor
24
+
25
+
26
# Build both model objects once, at import time. A failure is not raised:
# it is recorded in MODELS_LOADED / MODEL_ERROR and the model handles are
# set to None.
print("🚀 Initializing RusCxnPipe models...")
try:
    PIPELINE = RusCxnPipe(
        semantic_model="Futyn-Maker/ruscxn-embedder",
        classification_model="Futyn-Maker/ruscxn-classifier",
        span_model="Futyn-Maker/ruscxn-span-predictor",
    )
    SPAN_PREDICTOR = SpanPredictor(
        model_name="Futyn-Maker/ruscxn-span-predictor",
    )
    print("✅ Models initialized successfully!")
    MODELS_LOADED, MODEL_ERROR = True, None
except Exception as exc:
    MODEL_ERROR = str(exc)
    print(f"❌ Error initializing models: {MODEL_ERROR}")
    PIPELINE = None
    SPAN_PREDICTOR = None
    MODELS_LOADED = False
45
+
46
+
47
def highlight_span(
        text: str,
        span_start: int,
        span_end: int,
        span_string: str) -> str:
    """Return *text* as an HTML fragment with one span wrapped in ``<mark>``.

    The text is HTML-escaped piecewise, so characters such as ``<`` or ``&``
    are shown literally instead of being interpreted as markup.

    Args:
        text: The full text the span belongs to.
        span_start: Start offset of the span in *text* (inclusive).
        span_end: End offset of the span in *text* (exclusive).
        span_string: The text the span is expected to cover.

    Returns:
        The escaped text with the span highlighted. If the offsets are out
        of range or empty, or if the offsets disagree with *span_string* and
        the string cannot be found in *text*, the escaped text is returned
        without any highlight.
    """
    from html import escape  # local import keeps the block self-contained

    # Offsets outside the text, or describing an empty range, cannot be used.
    if span_start < 0 or span_end > len(text) or span_start >= span_end:
        return escape(text, quote=False)

    if text[span_start:span_end].strip() != span_string.strip():
        # The offsets do not cover the reported string: search for it instead.
        # An empty needle would "match" at offset 0 and yield an empty mark,
        # so it is treated as not found.
        span_start = text.find(span_string) if span_string else -1
        if span_start < 0:
            return escape(text, quote=False)
        span_end = span_start + len(span_string)

    # Escape each piece separately so the <mark> tags themselves stay intact.
    before = escape(text[:span_start], quote=False)
    highlighted = escape(text[span_start:span_end], quote=False)
    after = escape(text[span_end:], quote=False)

    return f'{before}<mark style="background-color: #ffeb3b; padding: 2px 4px; border-radius: 3px; font-weight: bold;">{highlighted}</mark>{after}'
72
+
73
+
74
def create_construction_link(construction_id: str, pattern: str) -> str:
    """Build an HTML anchor pointing at a construction's page.

    Args:
        construction_id: Identifier appended to the construction URL. It is
            percent-encoded so that it cannot break out of the ``href``.
        pattern: Text shown as the link label. It is HTML-escaped so that
            characters such as ``<`` or ``&`` are displayed literally.

    Returns:
        An ``<a>`` element that opens the construction page in a new tab.
    """
    from html import escape
    from urllib.parse import quote

    url = ("https://constructicon.ruscorpora.ru/construction/"
           + quote(str(construction_id), safe=""))
    label = escape(str(pattern), quote=False)
    return f'<a href="{escape(url)}" target="_blank" style="color: #1976d2; text-decoration: none; font-weight: bold; border-bottom: 1px dotted #1976d2;">{label}</a>'
78
+
79
+
80
def format_pipeline_results(results: Dict[str, Any]) -> str:
    """Render the output of the full pipeline as an HTML fragment.

    Args:
        results: Mapping with an ``'example'`` entry (the analysed text) and
            a ``'constructions'`` list. Each construction is read through
            its ``'id'``, ``'pattern'`` and ``'span'`` keys; the span mapping
            is read through ``'span_string'``, ``'span_start'`` and
            ``'span_end'``.

    Returns:
        One card per construction (linked pattern, the text with the span
        highlighted, and the span offsets), or a short notice when
        *results* is empty or contains no constructions.

    Raises:
        KeyError: If constructions are present but one of the keys listed
            above is missing.
    """
    from html import escape  # local import keeps the block self-contained

    # A missing 'constructions' key is treated like an empty list.
    constructions = results.get('constructions') if results else None
    if not constructions:
        return "<div style='padding: 20px; text-align: center; color: #666;'>No constructions found in the text.</div>"

    original_text = results['example']
    construction_count = len(constructions)

    card_open = "<div style='margin-bottom: 25px; padding: 15px; border: 1px solid #e0e0e0; border-radius: 8px; background-color: #fafafa;'>"
    text_box_open = "<div style='font-size: 16px; line-height: 1.5; margin-top: 10px; padding: 10px; background-color: white; border-radius: 4px; border: 1px solid #ddd;'>"

    # Every replacement field is kept on one line: a line break inside a
    # field of a single-quoted f-string is a SyntaxError before Python 3.12.
    html_parts = [
        "<div style='font-family: Arial, sans-serif;'>",
        f"<h3 style='color: #333; margin-bottom: 20px;'>Found {construction_count} construction(s):</h3>",
    ]

    for i, construction in enumerate(constructions, 1):
        span_info = construction['span']
        link = create_construction_link(
            construction['id'], construction['pattern'])

        html_parts.append(card_open)
        html_parts.append(
            f"<h4 style='margin: 0 0 10px 0; color: #333;'>{i}. {link}</h4>")

        span_string = span_info['span_string']
        if span_string:
            span_start = span_info['span_start']
            span_end = span_info['span_end']
            # highlight_span returns markup, so it is inserted as is.
            highlighted_text = highlight_span(
                original_text, span_start, span_end, span_string)
            html_parts.append(f"{text_box_open}{highlighted_text}</div>")

            # The span string is plain text and must be escaped here.
            shown_span = escape(str(span_string), quote=False)
            html_parts.append(
                "<div style='margin-top: 8px; font-size: 12px; color: #666;'>")
            html_parts.append(
                f'Span: "{shown_span}" (positions {span_start}-{span_end})')
            html_parts.append("</div>")
        else:
            # No span: show the plain (escaped) text without a highlight.
            shown_text = escape(str(original_text), quote=False)
            html_parts.append(f"{text_box_open}{shown_text}</div>")
            html_parts.append(
                "<div style='margin-top: 8px; font-size: 12px; color: #999;'>No specific span identified</div>")

        html_parts.append("</div>")

    html_parts.append("</div>")
    return "".join(html_parts)
140
+
141
+
142
def format_span_results(text: str, results: List[Dict[str, Any]]) -> str:
    """Render span-prediction results for a single text as an HTML fragment.

    Only the first element of *results* is used.

    Args:
        text: The analysed text.
        results: List whose first element holds a ``'patterns'`` list. Each
            pattern entry is read through its ``'pattern'`` and ``'span'``
            keys; the span mapping is read through ``'span_string'``,
            ``'span_start'`` and ``'span_end'``.

    Returns:
        One card per pattern (pattern, the text with the span highlighted,
        and the span offsets), or a short notice when there is nothing to
        show. Pattern and text are HTML-escaped before being inserted.

    Raises:
        KeyError: If a pattern entry lacks one of the keys listed above.
    """
    from html import escape  # local import keeps the block self-contained

    # A missing 'patterns' key is treated like an empty list.
    patterns = results[0].get('patterns') if results else None
    if not patterns:
        return "<div style='padding: 20px; text-align: center; color: #666;'>No patterns processed.</div>"

    pattern_count = len(patterns)
    card_open = "<div style='margin-bottom: 25px; padding: 15px; border: 1px solid #e0e0e0; border-radius: 8px; background-color: #fafafa;'>"
    text_box_open = "<div style='font-size: 16px; line-height: 1.5; margin-top: 10px; padding: 10px; background-color: white; border-radius: 4px; border: 1px solid #ddd;'>"

    # Every replacement field is kept on one line: a line break inside a
    # field of a single-quoted f-string is a SyntaxError before Python 3.12.
    html_parts = [
        "<div style='font-family: Arial, sans-serif;'>",
        f"<h3 style='color: #333; margin-bottom: 20px;'>Span predictions for {pattern_count} pattern(s):</h3>",
    ]

    for i, pattern_info in enumerate(patterns, 1):
        span_info = pattern_info['span']
        # The pattern is plain text (typed by the user in the span tab).
        shown_pattern = escape(str(pattern_info['pattern']), quote=False)

        html_parts.append(card_open)
        html_parts.append(
            f"<h4 style='margin: 0 0 10px 0; color: #333;'>{i}. {shown_pattern}</h4>")

        span_string = span_info['span_string']
        if span_string:
            span_start = span_info['span_start']
            span_end = span_info['span_end']
            # highlight_span returns markup, so it is inserted as is.
            highlighted_text = highlight_span(
                text, span_start, span_end, span_string)
            html_parts.append(f"{text_box_open}{highlighted_text}</div>")

            shown_span = escape(str(span_string), quote=False)
            html_parts.append(
                "<div style='margin-top: 8px; font-size: 12px; color: #666;'>")
            html_parts.append(
                f'Span: "{shown_span}" (positions {span_start}-{span_end})')
            html_parts.append("</div>")
        else:
            # No span: show the plain (escaped) text without a highlight.
            shown_text = escape(str(text), quote=False)
            html_parts.append(f"{text_box_open}{shown_text}</div>")
            html_parts.append(
                "<div style='margin-top: 8px; font-size: 12px; color: #999;'>No span found for this pattern</div>")

        html_parts.append("</div>")

    html_parts.append("</div>")
    return "".join(html_parts)
197
+
198
+
199
def process_full_pipeline(text: str, n_candidates: int) -> str:
    """Run the full pipeline on *text* and return the result as HTML.

    Args:
        text: Text to analyse. ``None``, empty or whitespace-only input
            produces a prompt to enter text.
        n_candidates: Number of semantic-search candidates; coerced to
            ``int`` before it is handed to the pipeline.

    Returns:
        The formatted results, or an HTML notice/error box. Errors raised
        while processing are reported in the returned HTML instead of being
        propagated.
    """
    from html import escape  # local import keeps the block self-contained

    cleaned = text.strip() if text else ""
    if not cleaned:
        return "<div style='padding: 20px; text-align: center; color: #666;'>Please enter some text to analyze.</div>"

    if not MODELS_LOADED:
        load_error = escape(str(MODEL_ERROR), quote=False)
        return f"<div style='color: red; padding: 20px;'>Error: {load_error}</div>"

    try:
        results = PIPELINE.process_text(
            cleaned, n_candidates=int(n_candidates))
        return format_pipeline_results(results)
    except Exception as exc:
        # UI boundary: show the failure to the user rather than crash.
        # The message is escaped because it is written into HTML, and the
        # replacement field stays on one line (required before Python 3.12).
        message = escape(str(exc), quote=False)
        return f"<div style='color: red; padding: 20px;'>Error processing text: {message}</div>"
214
+
215
+
216
def process_span_prediction(text: str, patterns_text: str) -> str:
    """Predict spans for user-supplied patterns and return the result as HTML.

    Args:
        text: Text to analyse. ``None``, empty or whitespace-only input
            produces a prompt to enter text.
        patterns_text: Patterns, one per line. Blank lines are ignored and
            each pattern is stripped of surrounding whitespace.

    Returns:
        The formatted span results, or an HTML notice/error box. Errors
        raised while predicting are reported in the returned HTML instead
        of being propagated.
    """
    from html import escape  # local import keeps the block self-contained

    cleaned = text.strip() if text else ""
    if not cleaned:
        return "<div style='padding: 20px; text-align: center; color: #666;'>Please enter some text to analyze.</div>"

    # Parse patterns up front: the list is empty exactly when the input is
    # missing or blank, so a single check covers both cases.
    patterns = [line.strip()
                for line in (patterns_text or "").splitlines()
                if line.strip()]
    if not patterns:
        return "<div style='padding: 20px; text-align: center; color: #666;'>Please enter some patterns to search for.</div>"

    if not MODELS_LOADED:
        load_error = escape(str(MODEL_ERROR), quote=False)
        return f"<div style='color: red; padding: 20px;'>Error: {load_error}</div>"

    # Prepare input for the span predictor: one example carrying all the
    # patterns, each with a generated id.
    examples_with_patterns = [{
        'example': cleaned,
        'patterns': [
            {'id': f'pattern_{i}', 'pattern': pattern}
            for i, pattern in enumerate(patterns)
        ],
    }]

    try:
        results = SPAN_PREDICTOR.predict_spans(examples_with_patterns)
        return format_span_results(cleaned, results)
    except Exception as exc:
        # UI boundary: show the failure to the user rather than crash.
        # The message is escaped because it is written into HTML, and the
        # replacement field stays on one line (required before Python 3.12).
        message = escape(str(exc), quote=False)
        return f"<div style='color: red; padding: 20px;'>Error processing spans: {message}</div>"
245
+
246
+ # Create the Gradio interface
247
+
248
+
249
def create_demo():
    """Assemble the Gradio Blocks interface and return it.

    The layout is a header, two tabs and a footer. The first tab wires
    ``process_full_pipeline`` to a text box and a candidate-count slider;
    the second wires ``process_span_prediction`` to a text box and a
    pattern list.

    Returns:
        The ``gr.Blocks`` instance. It is not launched here.
    """
    # Literals shared by several components.
    default_sentence = "Мои друзья разъехались и исчезли кто где."
    default_patterns = "VP кто PronInt\nVP кто где"
    pipeline_placeholder = "<div style='padding: 40px; text-align: center; color: #666; border: 2px dashed #ccc; border-radius: 8px;'>Enter text and click 'Analyze Text' to see results</div>"
    spans_placeholder = "<div style='padding: 40px; text-align: center; color: #666; border: 2px dashed #ccc; border-radius: 8px;'>Enter text and patterns, then click 'Predict Spans' to see results</div>"

    # Custom CSS
    custom_css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .gr-button {
        background: linear-gradient(90deg, #1976d2, #42a5f5);
        border: none;
        color: white;
        font-weight: bold;
    }
    .gr-button:hover {
        background: linear-gradient(90deg, #1565c0, #2196f3);
    }
    """

    with gr.Blocks(css=custom_css, title="RusCxnPipe Demo", theme=gr.themes.Soft()) as demo:

        # Header
        gr.Markdown("""
        # 🔍 RusCxnPipe: Russian Constructicon Pattern Extractor

        **Automatically identify and locate Russian constructicon patterns in text**

        This tool uses advanced NLP models to find linguistic constructions from the Russian Constructicon database in your text.
        It performs semantic search, classification, and span prediction to provide accurate results with precise text locations.

        """)

        with gr.Tabs():
            # Tab 1: full pipeline
            with gr.Tab("🚀 Full Pipeline", id="pipeline"):
                gr.Markdown("""
                ### Complete Analysis
                Enter Russian text to automatically find all constructicon patterns present in it.
                The system will search through the database, classify candidates, and highlight exact locations.
                """)

                with gr.Row():
                    with gr.Column(scale=2):
                        sentence_box = gr.Textbox(
                            label="Text",
                            placeholder=default_sentence,
                            lines=3,
                            value=default_sentence,
                        )

                        candidate_slider = gr.Slider(
                            minimum=5,
                            maximum=50,
                            value=15,
                            step=5,
                            label="Number of semantic search candidates",
                            info="More candidates = more thorough search but slower processing and higher probability of false-positives",
                        )

                        run_pipeline_btn = gr.Button(
                            "🔍 Analyze Text", variant="primary", size="lg")

                    with gr.Column(scale=3):
                        pipeline_output = gr.HTML(
                            label="Results",
                            value=pipeline_placeholder,
                        )

                # Examples: each button copies its sentence into the text box.
                gr.Markdown("### 📝 Try these examples:")
                sample_sentences = [
                    default_sentence,
                    "Мягко говоря, это была ошибка.",
                    "Петр так и замер на месте.",
                    "Таня танцевала без устали, танцевала со всеми подряд.",
                ]

                with gr.Row():
                    for sample in sample_sentences:
                        sample_btn = gr.Button(f'"{sample}"', size="sm")
                        # The default argument binds the current sentence;
                        # a plain closure would see only the last one.
                        sample_btn.click(
                            lambda x=sample: x, outputs=sentence_box)

                run_pipeline_btn.click(
                    fn=process_full_pipeline,
                    inputs=[sentence_box, candidate_slider],
                    outputs=pipeline_output,
                )

            # Tab 2: span prediction only
            with gr.Tab("🎯 Span Prediction", id="spans"):
                gr.Markdown("""
                ### Pattern Span Detection
                Enter text and specific patterns to find where exactly these patterns occur in the text.
                This skips the search and classification steps, directly predicting span boundaries.
                """)

                with gr.Row():
                    with gr.Column(scale=2):
                        span_sentence_box = gr.Textbox(
                            label="Text",
                            placeholder=default_sentence,
                            lines=3,
                            value=default_sentence,
                        )

                        patterns_box = gr.Textbox(
                            label="Patterns (one per line)",
                            placeholder=default_patterns,
                            lines=5,
                            value=default_patterns,
                        )

                        run_spans_btn = gr.Button(
                            "🎯 Predict Spans", variant="primary", size="lg")

                    with gr.Column(scale=3):
                        spans_output = gr.HTML(
                            label="Span Results",
                            value=spans_placeholder,
                        )

                run_spans_btn.click(
                    fn=process_span_prediction,
                    inputs=[span_sentence_box, patterns_box],
                    outputs=spans_output,
                )

        # Footer
        gr.Markdown("""
        ---
        **About RusCxnPipe**: This tool is based on fine-tuned transformer models trained on Russian Constructicon data.
        The pipeline combines semantic search, classification, and span prediction to achieve high accuracy in construction detection.

        **Models used**:
        - Semantic: [ruscxn-embedder](https://huggingface.co/Futyn-Maker/ruscxn-embedder)
        - Classification: [ruscxn-classifier](https://huggingface.co/Futyn-Maker/ruscxn-classifier)
        - Span prediction: [ruscxn-span-predictor](https://huggingface.co/Futyn-Maker/ruscxn-span-predictor)

        📚 [Russian Constructicon Database](https://constructicon.ruscorpora.ru/) | 💻 [Source Code](https://github.com/Futyn-Maker/ruscxnpipe)
        """)

    return demo
390
+
391
+
392
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it.
    demo = create_demo()
    demo.launch(
        show_error=True,
        server_port=7860,  # Default port for Spaces
        server_name="0.0.0.0",  # For Hugging Face Spaces
    )
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ git+https://github.com/Futyn-Maker/RusCxnPipe.git