Spaces:
Running
Running
Commit ·
cdaad51
1
Parent(s): 59bee7f
3.57
Browse files
app.py
CHANGED
|
@@ -73,7 +73,7 @@ class FallbackLLMSystem:
|
|
| 73 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 74 |
self.model = self.model.to(self.device)
|
| 75 |
|
| 76 |
-
st.success(f"
|
| 77 |
|
| 78 |
except Exception as e:
|
| 79 |
st.error(f"Error initializing MT5: {str(e)}")
|
|
@@ -230,10 +230,10 @@ class QwenSystem:
|
|
| 230 |
)
|
| 231 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
| 232 |
|
| 233 |
-
st.success(f"
|
| 234 |
|
| 235 |
except Exception as e:
|
| 236 |
-
st.error(f"
|
| 237 |
raise
|
| 238 |
|
| 239 |
def invoke(self, messages):
|
|
@@ -347,9 +347,9 @@ class EventDetectionSystem:
|
|
| 347 |
model="yiyanghkust/finbert-tone",
|
| 348 |
return_all_scores=True
|
| 349 |
)
|
| 350 |
-
st.success("BERT
|
| 351 |
except Exception as e:
|
| 352 |
-
st.error(f"
|
| 353 |
raise
|
| 354 |
|
| 355 |
def detect_event_type(self, text, entity):
|
|
@@ -404,70 +404,106 @@ class EventDetectionSystem:
|
|
| 404 |
|
| 405 |
class TranslationSystem:
|
| 406 |
def __init__(self):
|
| 407 |
-
"""Initialize translation system using Helsinki NLP model"""
|
| 408 |
try:
|
| 409 |
self.translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
|
| 410 |
-
|
|
|
|
|
|
|
|
|
|
| 411 |
except Exception as e:
|
| 412 |
-
st.error(f"
|
| 413 |
raise
|
| 414 |
-
|
| 415 |
-
def
|
| 416 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
return str(text) if pd.notna(text) else ""
|
| 418 |
-
|
| 419 |
text = str(text).strip()
|
| 420 |
if not text:
|
| 421 |
return ""
|
| 422 |
-
|
| 423 |
try:
|
| 424 |
-
|
| 425 |
-
chunks = self._split_into_chunks(text
|
| 426 |
translated_chunks = []
|
| 427 |
-
|
|
|
|
| 428 |
for chunk in chunks:
|
| 429 |
if not chunk.strip():
|
| 430 |
continue
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
except Exception as e:
|
| 446 |
st.warning(f"Translation error: {str(e)}")
|
| 447 |
-
return text
|
| 448 |
-
|
| 449 |
-
def _split_into_chunks(self, text, max_length):
|
| 450 |
-
sentences = []
|
| 451 |
-
for s in text.replace('!', '.').replace('?', '.').split('.'):
|
| 452 |
-
s = s.strip()
|
| 453 |
-
if s:
|
| 454 |
-
if len(s) > max_length:
|
| 455 |
-
# Split long sentences into smaller chunks
|
| 456 |
-
words = s.split()
|
| 457 |
-
current_chunk = []
|
| 458 |
-
current_length = 0
|
| 459 |
-
for word in words:
|
| 460 |
-
if current_length + len(word) > max_length:
|
| 461 |
-
sentences.append(' '.join(current_chunk))
|
| 462 |
-
current_chunk = [word]
|
| 463 |
-
current_length = len(word)
|
| 464 |
-
else:
|
| 465 |
-
current_chunk.append(word)
|
| 466 |
-
current_length += len(word) + 1
|
| 467 |
-
if current_chunk:
|
| 468 |
-
sentences.append(' '.join(current_chunk))
|
| 469 |
-
else:
|
| 470 |
-
sentences.append(s)
|
| 471 |
|
| 472 |
|
| 473 |
|
|
@@ -962,7 +998,7 @@ def main():
|
|
| 962 |
st.set_page_config(layout="wide")
|
| 963 |
|
| 964 |
with st.sidebar:
|
| 965 |
-
st.title("::: AI-анализ мониторинга новостей (v.3.
|
| 966 |
st.subheader("по материалам СКАН-ИНТЕРФАКС")
|
| 967 |
|
| 968 |
model_choice = st.radio(
|
|
|
|
| 73 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 74 |
self.model = self.model.to(self.device)
|
| 75 |
|
| 76 |
+
st.success(f"Запустил MT5-модель на {self.device}")
|
| 77 |
|
| 78 |
except Exception as e:
|
| 79 |
st.error(f"Error initializing MT5: {str(e)}")
|
|
|
|
| 230 |
)
|
| 231 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
| 232 |
|
| 233 |
+
st.success(f"запустил Qwen2.5 model")
|
| 234 |
|
| 235 |
except Exception as e:
|
| 236 |
+
st.error(f"ошибка запуска Qwen2.5: {str(e)}")
|
| 237 |
raise
|
| 238 |
|
| 239 |
def invoke(self, messages):
|
|
|
|
| 347 |
model="yiyanghkust/finbert-tone",
|
| 348 |
return_all_scores=True
|
| 349 |
)
|
| 350 |
+
st.success("BERT-модели запущены для детекции новостей")
|
| 351 |
except Exception as e:
|
| 352 |
+
st.error(f"Ошибка запуска BERT: {str(e)}")
|
| 353 |
raise
|
| 354 |
|
| 355 |
def detect_event_type(self, text, entity):
|
|
|
|
| 404 |
|
| 405 |
class TranslationSystem:
|
| 406 |
def __init__(self):
    """Create the primary translation pipeline and the two fallback translators.

    The outcome is reported through ``st.success`` / ``st.error``. Any
    exception raised while building one of the translators is re-raised
    after being reported, so construction fails loudly.
    """
    try:
        # Primary engine: Helsinki-NLP ru->en model loaded through `pipeline`.
        primary_engine = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
        self.translator = primary_engine

        # Secondary engines, consulted when the primary yields nothing usable.
        self.fallback_translator = GoogleTranslator(source='ru', target='en')
        self.legacy_translator = LegacyTranslator()
    except Exception as init_error:
        st.error(f"Ошибка запуска перевода: {str(init_error)}")
        raise
    else:
        st.success("Запустил систему перевода")
|
| 417 |
+
|
| 418 |
+
def _split_into_chunks(self, text: str, max_length: int = 450) -> list:
|
| 419 |
+
"""Split text into chunks while preserving word boundaries"""
|
| 420 |
+
words = text.split()
|
| 421 |
+
chunks = []
|
| 422 |
+
current_chunk = []
|
| 423 |
+
current_length = 0
|
| 424 |
+
|
| 425 |
+
for word in words:
|
| 426 |
+
word_length = len(word)
|
| 427 |
+
if current_length + word_length + 1 <= max_length:
|
| 428 |
+
current_chunk.append(word)
|
| 429 |
+
current_length += word_length + 1
|
| 430 |
+
else:
|
| 431 |
+
if current_chunk:
|
| 432 |
+
chunks.append(' '.join(current_chunk))
|
| 433 |
+
current_chunk = [word]
|
| 434 |
+
current_length = word_length
|
| 435 |
+
|
| 436 |
+
if current_chunk:
|
| 437 |
+
chunks.append(' '.join(current_chunk))
|
| 438 |
+
|
| 439 |
+
return chunks
|
| 440 |
+
|
| 441 |
+
def _translate_chunk_with_retries(self, chunk: str, max_retries: int = 3) -> str:
|
| 442 |
+
"""Attempt translation with multiple fallback options"""
|
| 443 |
+
if not chunk or not chunk.strip():
|
| 444 |
+
return ""
|
| 445 |
+
|
| 446 |
+
for attempt in range(max_retries):
|
| 447 |
+
try:
|
| 448 |
+
# First try Helsinki NLP
|
| 449 |
+
result = self.translator(chunk, max_length=512)
|
| 450 |
+
if result and isinstance(result, list) and len(result) > 0:
|
| 451 |
+
translated = result[0].get('translation_text')
|
| 452 |
+
if translated and isinstance(translated, str):
|
| 453 |
+
return translated
|
| 454 |
+
|
| 455 |
+
# First fallback: Google Translator
|
| 456 |
+
translated = self.fallback_translator.translate(chunk)
|
| 457 |
+
if translated and isinstance(translated, str):
|
| 458 |
+
return translated
|
| 459 |
+
|
| 460 |
+
# Second fallback: Legacy Google Translator
|
| 461 |
+
translated = self.legacy_translator.translate(chunk, src='ru', dest='en').text
|
| 462 |
+
if translated and isinstance(translated, str):
|
| 463 |
+
return translated
|
| 464 |
+
|
| 465 |
+
except Exception as e:
|
| 466 |
+
if attempt == max_retries - 1:
|
| 467 |
+
st.warning(f"Попробовал перевести {max_retries} раз, не преуспел: {str(e)}")
|
| 468 |
+
time.sleep(1 * (attempt + 1)) # Exponential backoff
|
| 469 |
+
|
| 470 |
+
return chunk # Return original text if all translation attempts fail
|
| 471 |
+
|
| 472 |
+
def translate_text(self, text: str) -> str:
    """Translate a piece of text chunk by chunk, never raising to the caller.

    Missing values (per ``pd.isna``) give ``""`` and other non-string
    values are returned as ``str(text)`` without translation. A string is
    stripped, split with ``_split_into_chunks``, and each chunk is passed
    to ``_translate_chunk_with_retries``; the non-empty results are joined
    with single spaces.

    Returns:
        The joined translation, or the stripped original text when nothing
        was translated or an exception occurred (a warning is shown via
        ``st.warning`` in the latter case).
    """
    # Guard: anything that is missing or not a string is not translated.
    if pd.isna(text) or not isinstance(text, str):
        return str(text) if pd.notna(text) else ""

    source = str(text).strip()
    if not source:
        return ""

    try:
        pieces = []
        for chunk in self._split_into_chunks(source):
            if not chunk.strip():
                continue

            piece = self._translate_chunk_with_retries(chunk)
            if piece:  # keep only non-empty translations
                pieces.append(piece)
            time.sleep(0.1)  # Rate limiting

        joined = ' '.join(pieces)
        if pieces and joined.strip():
            return joined
        # Nothing usable came back: hand the caller the original text.
        return source

    except Exception as e:
        st.warning(f"Translation error: {str(e)}")
        return source
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
|
| 508 |
|
| 509 |
|
|
|
|
| 998 |
st.set_page_config(layout="wide")
|
| 999 |
|
| 1000 |
with st.sidebar:
|
| 1001 |
+
st.title("::: AI-анализ мониторинга новостей (v.3.57):::")
|
| 1002 |
st.subheader("по материалам СКАН-ИНТЕРФАКС")
|
| 1003 |
|
| 1004 |
model_choice = st.radio(
|