Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from sentence_transformers import SentenceTransformer | |
| from transformers import AutoTokenizer, AutoModel | |
| import pandas as pd | |
| from time import time | |
| import numpy as np | |
| from src.A_Preprocess import clean_text | |
| from src.E_Summarization import simple_summarize_text #, summarize_text | |
| from src.E_Model_utils import get_transformes_embeddings, load_model, get_embeddings | |
| from src.E_Faiss_utils import load_faiss_index, normalize_embeddings | |
| import warnings | |
| warnings.filterwarnings("ignore", category=FutureWarning) | |
# --- Page header and encoder selection --------------------------------------
st.header('Watson Assistant VDF TOBi improvement')
st.write('The model is trained on the TOBi 🤖 intents in Romanian language.')
# Bare string literal on purpose: Streamlit "magic" renders it as markdown.
'---'

# Sidebar label -> source handed to SentenceTransformer(...).
# Other encoders that were tried and are currently disabled:
#   "xlm-roberta-base", "xlm-r-distilroberta-base-paraphrase-v1"
SENTENCE_TRANSFORMER_SOURCES = {
    "llama3.2-1b": "AlexHung29629/sgpt-llama3.2-1b-stage1",
    "MiniLM-L12-v2": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "multilingual-e5-small": "intfloat/multilingual-e5-small",
    "e5_small_fine_tuned_model": r"output\fine-tuned-model",
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "all-distilroberta-v1": "sentence-transformers/all-distilroberta-v1",
}

# Sidebar label -> Hugging Face repo loaded through AutoTokenizer/AutoModel.
TRANSFORMERS_SOURCES = {
    "bert-base-romanian-cased-v1": "dumitrescustefan/bert-base-romanian-cased-v1",
}

# Sidebar labels that are shortened for display; the long form becomes
# `model_name` from here on.
MODEL_NAME_ALIASES = {
    "MiniLM-L12-v2": "paraphrase-multilingual-MiniLM-L12-v2",
}

model_name = st.sidebar.radio(
    "Selectează modelul 👇",
    [
        "MiniLM-L12-v2",
        "llama3.2-1b",
        "all-MiniLM-L6-v2",
        "bert-base-romanian-cased-v1",
        "multilingual-e5-small",
        "e5_small_fine_tuned_model",
        "all-distilroberta-v1",
    ],
)

# Always bind these names (None when they do not apply) so later code never
# trips over an undefined variable, whichever radio option is active.
infloat_model_name = SENTENCE_TRANSFORMER_SOURCES.get(model_name)
transformer_model_name = TRANSFORMERS_SOURCES.get(model_name)
# NOTE(review): this marker string is not read anywhere in the visible code;
# kept only so existing references (if any) keep working.
local_only = "local_files_only = True" if model_name == "e5_small_fine_tuned_model" else None

if infloat_model_name is None and transformer_model_name is None:
    st.write("Choose a model")

model_name = MODEL_NAME_ALIASES.get(model_name, model_name)

# NOTE(review): nothing has been loaded at this point; the message only
# reflects the current selection.
st.write(f"Model **{model_name}** loaded successfully!")
# Seed every session-state key used by the page. Keys that already exist
# (i.e. survived a previous rerun) are left untouched.
_SESSION_DEFAULTS = {
    'index_loaded': False,
    'index': None,
    'pdf_button_enabled': False,
    'data': None,
    'intent_button_clicked': False,
    'intent': None,
    'similarity': None,
    'model': None,
    'summar_model': None,
    'summarized_text': None,
    'csv_copied': False,
    'csv_file_path': r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\Pager_Intents_cleaned.csv',
    'copied_csv_file_path': r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\Pager_Intents_cleaned_Copy.csv',
    'user_text': "",
    'user_utterance_updated': r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\data\User_utterances_updated.csv',
}

for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
def create_csv_copy():
    """Write a working copy of the intents CSV.

    Reads the file at ``st.session_state.csv_file_path`` with pandas, writes
    it (without the index column) to ``st.session_state.copied_csv_file_path``,
    sets ``st.session_state.csv_copied`` to True and shows a success message.
    """
    state = st.session_state
    source_frame = pd.read_csv(state.csv_file_path)
    source_frame.to_csv(state.copied_csv_file_path, index=False)
    state.csv_copied = True
    st.success("CSV file copied successfully.")
def add_user_text_and_intent():
    """Append the current utterance, intent and similarity to the CSV copy.

    The row is built from ``st.session_state.user_text``, ``.intent`` and
    ``.similarity`` and written back to ``st.session_state.copied_csv_file_path``.
    Nothing happens unless ``st.session_state.csv_copied`` is set.
    """
    state = st.session_state
    if not state.csv_copied:
        return

    copy_path = state.copied_csv_file_path
    existing_rows = pd.read_csv(copy_path)

    new_row = {
        'utterance': state.user_text,
        'intent': state.intent,
        'similarity': state.similarity,
    }
    # Echo the row on the page before it is persisted.
    st.write(new_row)

    appended = pd.DataFrame([new_row])
    combined = pd.concat([existing_rows, appended], ignore_index=True)
    combined.to_csv(f'{copy_path}', index=False)
    st.success("User text and intent added to the copied CSV file successfully.")
# First button: load the selected encoder and its FAISS index into session
# state so they survive Streamlit reruns.
if st.button("Load Embeddings and Index"):
    if model_name == "e5_small_fine_tuned_model":
        # NOTE(review): machine-specific absolute path; this only works on the
        # original author's workstation.
        model = SentenceTransformer(
            r'C:\Users\ZZ029K826\Documents\GitHub\LLM_Intent_Recognition\src\output\fine-tuned-model\e5_small_fine_tuned_model',
            local_files_only=True,
        )
        # Show a couple of facts about the fine-tuned model.
        vocab_size = model.tokenizer.vocab_size
        st.write(f"**Vocab Size:** {vocab_size}")
        max_len = model.max_seq_length
        st.write(f"**Max Sequence Length:** {max_len}")
    elif model_name == "bert-base-romanian-cased-v1":
        bert_repo = "dumitrescustefan/bert-base-romanian-cased-v1"
        tokenizer = AutoTokenizer.from_pretrained(bert_repo)
        model = AutoModel.from_pretrained(bert_repo)
        # Keep the tokenizer next to the model: a plain AutoModel cannot
        # embed text without it, and reloading it on every rerun is slow.
        st.session_state.tokenizer = tokenizer
    else:
        model = SentenceTransformer(infloat_model_name)
    st.session_state.model = model

    index = load_faiss_index(f"embeddings/{model_name}_vector_db.index")
    st.session_state.index = index
    st.session_state.index_loaded = True
    st.write("Embeddings and index loaded successfully!")
# CSV uploader: shown only once the index has been loaded.
if st.session_state.index_loaded:
    # Bare string literal: rendered by Streamlit "magic".
    '-------------------'
    st.write('✨ Load the csv file?')
    uploaded_file = st.file_uploader("Search the csv file", type="csv")
    if uploaded_file is None:
        # Nothing in the widget right now; fall back to the frame kept in
        # session state from an earlier upload, if there is one.
        if st.session_state.data is not None:
            st.write("Previously uploaded data:")
            st.write(st.session_state.data[:5])  # first 5 rows only
    else:
        st.session_state.data = pd.read_csv(uploaded_file)
        st.write("CSV file successfully uploaded!")
        st.write(st.session_state.data)  # full uploaded frame

# Convenience alias for the uploaded frame (None until a file is uploaded).
data = st.session_state.data
| ... | |
# Single-utterance flow: read the text, embed it, look up the nearest intent
# in the FAISS index and optionally append the result to the CSV copy.
if st.session_state.data is not None:
    # Bare string literal: rendered by Streamlit "magic".
    '-------------------'
    user_text = st.text_area("👇 Enter user utterance text:", placeholder='User text')
    st.write(f'Text length: {len(user_text)}')

    if user_text:
        if len(user_text) > 150:
            st.write("The text is too long. Please summarize it.")
            summarize_button = st.button("Summarize")
            if summarize_button:
                st.session_state.summarized_text = simple_summarize_text(user_text)
                # Remember which input the summary belongs to. A button is
                # True only for the rerun in which it was clicked, so without
                # this the summary is dropped as soon as any other widget
                # triggers a rerun.
                st.session_state.summarized_source = user_text
            if (
                'summarized_source' in st.session_state
                and st.session_state.summarized_source == user_text
                and st.session_state.summarized_text
            ):
                user_text = st.session_state.summarized_text
                st.write(f"The summarized text: {user_text}")

        # Text that will be stored if the user confirms the intent.
        st.session_state.user_text = user_text
        start = time()

        cleaned_text = clean_text(user_text)

        # Embed the cleaned text with the encoder kept in session state.
        model = st.session_state.model
        if model_name == "bert-base-romanian-cased-v1":
            # Reuse the encoder stored by the load button and fetch the
            # tokenizer at most once per session instead of on every rerun.
            if 'tokenizer' not in st.session_state:
                st.session_state.tokenizer = AutoTokenizer.from_pretrained(
                    "dumitrescustefan/bert-base-romanian-cased-v1"
                )
            tokenizer = st.session_state.tokenizer
            input_embedding = get_transformes_embeddings([cleaned_text], model, tokenizer)
        else:
            input_embedding = get_embeddings(model, [cleaned_text])

        normalized_embedding = normalize_embeddings(input_embedding)
        st.session_state.input_embedding = normalized_embedding
        st.session_state.cleaned_text = cleaned_text

        # The click is latched in session state so the nested "Append" button
        # below stays visible on the rerun caused by clicking it.
        intent_button = st.button("Calculate Intent and Similarity")
        if intent_button:
            st.session_state.intent_button_clicked = True

        if st.session_state.intent_button_clicked and st.session_state.input_embedding is not None:
            start = time()
            index = st.session_state.index
            D, I = index.search(st.session_state.input_embedding, 1)  # nearest neighbour only
            intents = st.session_state.data['intent'].tolist()
            nearest = int(I[0][0])

            if not 0 <= nearest < len(intents):
                # FAISS reports -1 when it has no neighbour to return, and a
                # label beyond the CSV means index and CSV are out of sync.
                # Indexing blindly would pick the last row or raise IndexError.
                st.error("No matching intent found: the FAISS index and the uploaded CSV do not line up.")
            else:
                intent = intents[nearest]
                distance = D[0][0]
                # NOTE(review): assumes the index returns a non-negative
                # distance; confirm the metric the index was built with.
                similarity = 1 / (1 + distance)

                # Persist the result across reruns.
                st.session_state.intent = intent
                st.session_state.similarity = similarity

                st.write(f"Intent: {intent}")
                st.write(f"Confidence: {similarity:.4f}")
                st.write(f"Timp de răspuns: {time() - start:.4f} secunde")

                '-------------------'
                st.write(f'✨ Correct Intent: **{intent}**?')
                if st.button("Append User Text and Intent"):
                    # Create the working copy only once per session;
                    # re-copying on every click would overwrite the rows
                    # appended earlier.
                    if not st.session_state.csv_copied:
                        create_csv_copy()
                    add_user_text_and_intent()
                '-------------------'
# Session-state slots used by the batch (CSV) flow; existing values are kept.
for _batch_key in ('utt_csv_file', 'utt_intent_results_df', 'utt_csv_file_df'):
    if _batch_key not in st.session_state:
        st.session_state[_batch_key] = None
def apply_similarity_search(df):
    """Fill ``df['intent']`` with the intent of each utterance's nearest neighbour.

    Every value of ``df['utterance']`` is encoded with
    ``st.session_state.model``, looked up in ``st.session_state.index`` and
    mapped to the ``intent`` column of ``st.session_state.data``. The frame is
    modified in place, saved as ``Updated_<uploaded file name>`` in the current
    working directory, and returned.

    Args:
        df: DataFrame with an ``utterance`` column.

    Returns:
        The same DataFrame, with the ``intent`` column (re)written. Rows whose
        neighbour label does not map to a row of the intents table get None.

    Raises:
        KeyError: if ``df`` has no ``utterance`` column.
    """
    if 'utterance' not in df.columns:
        raise KeyError("The column 'utterance' does not exist in the DataFrame.")

    # Missing cells are read by pandas as float NaN; encode them as empty text.
    utterances = df['utterance'].fillna('').astype(str).tolist()

    labels = []
    if utterances:
        # NOTE(review): relies on the session model exposing `.encode`; a
        # plain transformers AutoModel does not - confirm the BERT path.
        embeddings = np.ascontiguousarray(
            st.session_state.model.encode(utterances), dtype='float32'
        )
        # One batched search instead of one FAISS call per row.
        _, neighbours = st.session_state.index.search(embeddings, 1)
        intents = st.session_state.data['intent'].tolist()
        intent_count = len(intents)
        # FAISS uses -1 for "no neighbour"; never let it wrap around to the
        # last intent or run past the end of the table.
        labels = [
            intents[position] if 0 <= position < intent_count else None
            for position in (int(label) for label in neighbours[:, 0])
        ]

    # Whole-column assignment is positional, so it does not depend on the
    # frame's index labels, and it creates the column when it is missing.
    df['intent'] = labels

    csv_file_name = st.session_state.utt_csv_file.name
    df.to_csv(f'Updated_{csv_file_name}', index=False)
    return df
# Batch flow, step 1: upload the utterance file. Offered only after a
# similarity has been computed for a single utterance.
if st.session_state.similarity and st.session_state.utt_csv_file is None:
    st.header('✨ Auto-update the utterances list without intent')
    csv_file = st.file_uploader("Load User utterances file", type="csv")
    if csv_file is not None:
        st.session_state.utt_csv_file = csv_file
        df = pd.read_csv(csv_file, encoding='windows-1252')
        st.session_state.utt_csv_file_df = df
        # Show only the columns of interest. The file may come without an
        # 'intent' column, so select what exists instead of raising KeyError.
        display_df = df[[column for column in ('utterance', 'intent') if column in df.columns]]
        st.write(display_df)
        st.success("Utterance file loaded successfully.")
elif st.session_state.similarity and st.session_state.utt_csv_file_df is not None:
    st.write("Utterance file already loaded.")
    df = st.session_state.utt_csv_file_df
    display_df = df[[column for column in ('utterance', 'intent') if column in df.columns]]
    st.write(display_df)
# Batch flow, step 2: label the uploaded utterances on demand.
if st.session_state.utt_csv_file is not None and st.button("Apply Similarity Search to CSV"):
    st.write("Performing similarity search on the uploaded CSV file...")
    df = st.session_state.utt_csv_file_df
    # Time the batch search itself and keep the figure in session state, so
    # the value shown below is the real duration on every rerun.
    batch_started = time()
    results_df = apply_similarity_search(df)
    st.session_state.utt_search_seconds = time() - batch_started
    st.session_state.utt_intent_results_df = results_df

# Display the results if available (also on reruns after the search).
if st.session_state.utt_intent_results_df is not None:
    st.write("Results:")
    df = st.session_state.utt_intent_results_df
    # Display only the utterance and intent columns.
    display_results_df = df[['utterance','intent']]
    st.write(display_results_df)
    if 'utt_search_seconds' in st.session_state:
        st.write(f"Timp de răspuns: {st.session_state.utt_search_seconds:.4f} secunde")

# Optional: display previous results if the process was already done
#if st.session_state.intent is not None:
#    st.write(f"Intenția identificată anterior: {st.session_state.intent}")
#    st.write(f"Nivel de încredere anterior: {st.session_state.similarity:.4f}")

# Stop the Streamlit app
st.stop()