| import re |
| import pandas as pd |
| import pymorphy2 |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.metrics.pairwise import cosine_similarity |
| from sklearn.metrics import accuracy_score |
| from translate import Translator |
| import streamlit as st |
| import language_tool_python |
| import langid |
|
|
|
|
| |
# Morphological analyzer used to lemmatize words during preprocessing.
morph = pymorphy2.MorphAnalyzer()

# LanguageTool client (public API flavour) configured for Russian ('ru').
tool = language_tool_python.LanguageToolPublicAPI('ru')

# Classifier table; the code below reads its 'NAME_RU2' and 'CODE' columns.
nkz_list = pd.read_csv('filtered_nkz.csv')
|
|
|
|
|
|
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| from translate import Translator |
|
|
def detect_language(text):
    """Return the language code that ``langid`` assigns to *text*.

    ``langid.classify`` yields a pair; only its first element (the language
    code) is returned, the second element is discarded.
    """
    return langid.classify(text)[0]
def translate_text(text, from_lang='kk', to_lang='ru'):
    """Translate text from one language to another.

    :param text: Source text to translate.
    :param from_lang: Language code of the source text (defaults to Kazakh, 'kk').
    :param to_lang: Language code of the target language (defaults to Russian, 'ru').
    :return: The translated text. The input is returned unchanged when it is
        an empty/whitespace-only string or when the translation raises.
    """
    # Nothing to translate: skip building a translator for blank strings.
    if isinstance(text, str) and not text.strip():
        return text

    try:
        translator = Translator(from_lang=from_lang, to_lang=to_lang)
        return translator.translate(text)
    except Exception as e:
        # Best-effort by design: report the failure and fall back to the
        # untranslated input so the caller can keep going.
        print(f"Ошибка перевода: {e}")
        return text
|
|
| |
# Lowercase letters of the Kazakh Cyrillic alphabet that are absent from the
# Russian alphabet. preprocess_text treats the presence of any of them as a
# signal that the text should be translated first.
kazakh_letters = set("әғқңөұүі\u04bb")  # \u04bb is "һ" (shha), previously missing
|
|
|
|
|
|
def preprocess_text(text):
    """Normalize text for matching: translate, clean, lowercase, lemmatize.

    Steps, in order:

    1. Non-string input yields an empty string.
    2. If the text contains any character from ``kazakh_letters`` it is passed
       through ``translate_text`` first.
    3. The text is lowercased and stripped; punctuation is replaced by spaces;
       ASCII letters and digits are removed.
    4. Each remaining word is replaced by the normal form of its first
       ``morph.parse`` result.

    :param text: Raw text (any other type is treated as missing).
    :return: The lemmas joined by single spaces; may be an empty string.
    """
    if not isinstance(text, str):
        return ""

    if any(char in kazakh_letters for char in text.lower()):
        text = translate_text(text)

    text = text.lower().strip()
    # Replace punctuation with a space instead of deleting it, so that
    # "инженер-программист" or "слесарь/сантехник" split into separate words
    # rather than collapsing into one glued token. split() below absorbs the
    # extra whitespace this introduces.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'[a-zA-Z0-9]', '', text)

    lemmas = [morph.parse(word)[0].normal_form for word in text.split()]
    return ' '.join(lemmas)
|
|
| |
|
|
|
|
def text_correct(text):
    """Translate *text* when it is detected as Kazakh, then apply LanguageTool fixes.

    :param text: Text to correct.
    :return: The text after applying the corrections suggested by ``tool``;
        an empty/whitespace-only string is returned unchanged.
    """
    # preprocess_text can legitimately produce "" (e.g. digit-only input);
    # there is nothing to detect or correct, so skip the language detection
    # and the remote check altogether.
    if isinstance(text, str) and not text.strip():
        return text

    if detect_language(text) == 'kk':
        text = translate_text(text)

    matches = tool.check(text)
    return language_tool_python.utils.correct(text, matches)
|
|
def find_best_matches(profession, nkz_list, vectorizer, tfidf_nkz, top_n=10):
    """Find the closest classifier entries for a single profession title.

    The title is normalized with ``preprocess_text`` and ``text_correct``,
    vectorized with *vectorizer*, and compared against *tfidf_nkz* by cosine
    similarity.

    :param profession: Profession title entered by the user.
    :param nkz_list: DataFrame whose 'NAME_RU2' and 'CODE' columns are reported
        for each match; rows are addressed positionally, so they are expected
        to line up with the rows of *tfidf_nkz*.
    :param vectorizer: Fitted vectorizer providing ``transform``.
    :param tfidf_nkz: Matrix of vectorized classifier names.
    :param top_n: Maximum number of matches to return (default 10).
    :return: DataFrame with columns 'profession', 'nkz_match', 'nkz_code' and
        'similarity', ordered by decreasing similarity. It is empty when
        *top_n* is not positive or when nothing has a non-zero similarity.
    """
    columns = ['profession', 'nkz_match', 'nkz_code', 'similarity']

    # A slice such as [-0:] selects the whole array, so a non-positive top_n
    # must be handled explicitly instead of returning every row.
    if top_n <= 0:
        return pd.DataFrame(columns=columns)

    processed_profession = preprocess_text(profession)
    processed_profession = text_correct(processed_profession)
    print(processed_profession)

    tfidf_profession = vectorizer.transform([processed_profession])
    scores = cosine_similarity(tfidf_profession, tfidf_nkz)[0]

    # Best scores first, capped at top_n. Entries with zero similarity share
    # no terms with the query and their order is arbitrary, so they are not
    # reported as matches.
    ranked_idx = scores.argsort()[::-1][:top_n]
    top_matches = [
        {
            'profession': processed_profession,
            'nkz_match': nkz_list.iloc[idx]['NAME_RU2'],
            'nkz_code': nkz_list.iloc[idx]['CODE'],
            'similarity': scores[idx],
        }
        for idx in ranked_idx
        if scores[idx] > 0
    ]
    # Passing columns keeps the schema stable even when the list is empty.
    return pd.DataFrame(top_matches, columns=columns)
|
|
|
|
|
|
# Store the normalized (lemmatized) form of every classifier name next to
# the original, so queries and classifier entries go through the same pipeline.
nkz_list['cleaned'] = nkz_list['NAME_RU2'].map(preprocess_text)

# Learn the TF-IDF vocabulary from the cleaned names and keep their matrix
# for the similarity lookups performed on each search.
vectorizer = TfidfVectorizer()
tfidf_nkz = vectorizer.fit_transform(nkz_list['cleaned'])
| |
# --- Streamlit page: one text field, one button, one results table ---
st.title("Occupation Similarity Finder")

text1 = st.text_input("Enter the occupation to compare:", "Оператор пульта управления")

if st.button("Find Similar Occupations"):
    try:
        results = find_best_matches(text1, nkz_list, vectorizer, tfidf_nkz, top_n=10)

        # find_best_matches returns a DataFrame, so a bare "is not None" test
        # is always true; also check for emptiness so the warning below can
        # actually be shown.
        if results is not None and not results.empty:
            st.write("Similar Occupations:")
            st.dataframe(results)
        else:
            st.warning("No similar occupations found.")
    except Exception as e:
        # Top-level boundary of the page: surface any failure to the user
        # instead of letting the script run abort.
        st.error(f"An error occurred: {e}")
|
|