# Hugging Face Spaces page header captured with this file; reported status: "Runtime error".
import json as js
import os
import re
from functools import lru_cache
from typing import List, Tuple

import fasttext
import gradio as gr
import joblib
import omikuji
from huggingface_hub import snapshot_download

from install_packages import download_model
# Fetch the fastText language-identification model; download_model comes from
# the project-local install_packages module.
download_model('https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin', 'lid.176.bin')

# Download the model files from Hugging Face into local directories named
# after their repo ids; find_model() later reads them from those paths.
for repo_id in ['kapllan/omikuji-bonsai-parliament-de-spacy', 'kapllan/omikuji-bonsai-parliament-fr-spacy',
                'kapllan/omikuji-bonsai-parliament-it-spacy']:
    # exist_ok=True replaces the separate exists()-then-create check.
    os.makedirs(repo_id, exist_ok=True)
    # NOTE(review): the original indentation was lost; this assumes the
    # download is requested for every repo on every start - confirm.
    model_dir = snapshot_download(repo_id=repo_id, local_dir=repo_id)

# Language-identification model used by predict_lang().
lang_model = fasttext.load_model('lid.176.bin')

# JSON is UTF-8 by specification, so do not depend on the platform's default
# text encoding when reading the label table.
with open('./id2label.json', 'r', encoding='utf-8') as f:
    id2label = js.load(f)
def map_language(language: str) -> str:
    """Translate a language code into its English display name.

    Codes without a known display name are returned unchanged.
    """
    display_names = {'de': 'German', 'it': 'Italian', 'fr': 'French'}
    return display_names.get(language, language)
# Languages for which a topic model directory is expected on disk.
_SUPPORTED_LANGUAGES = ('de', 'fr', 'it')


@lru_cache(maxsize=len(_SUPPORTED_LANGUAGES))
def _load_model(language: str):
    """Load the vectorizer and Omikuji model for one supported language from disk."""
    model_root = f'./kapllan/omikuji-bonsai-parliament-{language}-spacy'
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only point this at artifacts from a trusted repository.
    vectorizer = joblib.load(f'{model_root}/vectorizer')
    model = omikuji.Model.load(f'{model_root}/omikuji-model')
    return vectorizer, model


def find_model(language: str):
    """Return the ``(vectorizer, model)`` pair for a language code.

    Args:
        language: Language code such as ``'de'``, ``'fr'`` or ``'it'``.

    Returns:
        ``(vectorizer, model)`` for a supported language, otherwise
        ``(None, None)``.

    Loaded pairs are cached per language, so the files are read from disk only
    on the first request for that language instead of on every call.
    """
    if language not in _SUPPORTED_LANGUAGES:
        return None, None
    return _load_model(language)
def predict_lang(text: str) -> str:
    """Detect the language of *text* with the fastText language-ID model.

    Args:
        text: Document to classify; may span several lines.

    Returns:
        The best-matching label with its ``__label__`` prefix removed, or an
        empty string if the model returns no label.
    """
    # fastText cannot process input that contains line breaks. Replace them
    # with spaces rather than deleting them, so the words on either side of a
    # break are not fused into one token.
    text = text.replace('\n', ' ')
    predictions = lang_model.predict(text, k=1)  # k=1: only the top match
    labels = predictions[0]
    if len(labels) == 0:
        # NOTE(review): presumably only reachable for empty/blank input - confirm.
        return ''
    return labels[0].replace('__label__', '')
def predict_topic(text: str, top_k: int = 1000) -> Tuple[List[Tuple[str, float]], str]:
    """Predict topic labels for *text* using the model of its detected language.

    Args:
        text: Document to classify.
        top_k: Maximum number of predictions requested from the model.

    Returns:
        A ``(results, language)`` tuple. ``results`` is a list of
        ``(label, score)`` pairs and is empty when no model exists for the
        detected language or the document vectorizes to all zeros.
        ``language`` is the detected language after ``map_language``.

    Raises:
        KeyError: If the model predicts an id that is missing from ``id2label``.
    """
    language = predict_lang(text)
    vectorizer, model = find_model(language)
    language = map_language(language)

    results = []
    if vectorizer is None:
        return results, language

    for row in vectorizer.transform([text]):
        if row.nnz == 0:  # All-zero vector: nothing to feed the model
            continue
        # The model takes (feature index, value) pairs for the non-zero entries.
        feature_values = [(col, row[0, col]) for col in row.nonzero()[1]]
        results.extend(
            (id2label[str(subj_id)], score)
            for subj_id, score in model.predict(feature_values, top_k=top_k)
        )
    return results, language
def topic_modeling(text: str, threshold: float) -> Tuple[List[Tuple[str, float]], str]:
    """Return the topics of *text* scoring at least *threshold*, plus its language.

    Args:
        text: Document to classify.
        threshold: Minimum score a topic needs to be kept.

    Returns:
        A ``(topics, language)`` tuple. ``topics`` is a list of
        ``(label, score)`` pairs and is empty when nothing was predicted or the
        detected language is not German, French or Italian.
    """
    topics, language = predict_topic(text)
    if topics and language in ('German', 'French', 'Italian'):
        return [topic for topic in topics if topic[1] >= threshold], language
    return [], language
# Build the UI. NOTE(review): the original indentation was lost; the nesting
# below (inputs in the first column, results table in the second) is inferred.
with gr.Blocks() as iface:
    gr.Markdown("# Topic Modeling")
    gr.Markdown("Enter a document and get each topic along with its score.")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(lines=10, placeholder="Enter a document")
            submit_button = gr.Button("Submit")
            threshold_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.01,
                label="Score Threshold",
                value=0.0,
            )
            language_text = gr.Textbox(
                lines=1,
                placeholder="Detected language will be shown here...",
                interactive=False,
                label="Detected Language",
            )
        with gr.Column():
            output_data = gr.Dataframe(headers=["Label", "Score"])

    # Clicking the button runs topic_modeling on the text and threshold and
    # routes its two return values to the table and the language box.
    submit_button.click(
        fn=topic_modeling,
        inputs=[input_text, threshold_slider],
        outputs=[output_data, language_text],
    )

# Start the app
iface.launch(share=True)