import re

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from PIL import Image
from sentence_transformers import SentenceTransformer

# One-time NLTK resource downloads (no-ops once cached). Newer NLTK releases
# may also require 'punkt_tab' for word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# all-mpnet-base-v2 yields 768-dim sentence embeddings for text-only search;
# clip-ViT-B-32 embeds both text and images into a shared 512-dim space.
model = SentenceTransformer('all-mpnet-base-v2')
clip_model = SentenceTransformer('clip-ViT-B-32')


def clean_text(text):
    """Lowercase, strip non-letters, drop stopwords, and lemmatize."""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word)
             for word in words if word not in stop_words]
    return ' '.join(words)
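
# Illustrative trace (the exact output depends on the NLTK stopword list and
# on WordNetLemmatizer defaulting to the noun POS):
#   clean_text("The cats are running!")  ->  "cat running"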


def extract_order_id_from_query(text):
    """Return the first order ID of the form B-<digits> in text, or None."""
    match = re.search(r'\bB-\d+\b', text)
    if match:
        return match.group(0)
    return None
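
# Quick sanity checks for the pattern:
#   extract_order_id_from_query("Where is my order B-12345?")  ->  "B-12345"
#   extract_order_id_from_query("no id mentioned")             ->  None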


def generate_text_embedding(text):
    """Embed a query string with CLIP; returns a 512-dim numpy vector."""
    try:
        text_embedding = clip_model.encode(text, convert_to_tensor=True)
        return text_embedding.cpu().numpy()
    except Exception as e:
        print(f"Error processing text '{text}': {e}")
        # Zero vector matches clip-ViT-B-32's embedding size.
        return np.zeros((512,))


def generate_image_embedding(image_path):
    """Embed an image file with CLIP; returns a 512-dim numpy vector."""
    try:
        # CLIP expects RGB, so convert e.g. RGBA or grayscale images first.
        image = Image.open(image_path).convert('RGB')
        image_embedding = clip_model.encode(image, convert_to_tensor=True)
        return image_embedding.cpu().numpy()
    except Exception as e:
        print(f"Error processing image from {image_path}: {e}")
        return np.zeros((512,))
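

# Because clip-ViT-B-32 places text and images in the same space, the two
# embeddings above are directly comparable. A minimal sketch of cross-modal
# scoring (this helper is illustrative, not part of the original pipeline):
from sentence_transformers import util

def text_image_similarity(text, image_path):
    """Cosine similarity between a query string and an image file."""
    return float(util.cos_sim(generate_text_embedding(text),
                              generate_image_embedding(image_path)))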


def clear_chat():
    """Reset the chat history to an empty list."""
    return []


def undo_last_message(chatbot):
    """Remove the most recent entry from the chat history, if any."""
    if chatbot:
        chatbot.pop()
    return chatbot
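
# How these handlers are typically wired up (a sketch assuming a Gradio Blocks
# UI, which the signatures suggest; component names here are hypothetical):
#
#     with gr.Blocks() as demo:
#         chatbot = gr.Chatbot()
#         clear_btn = gr.Button("Clear chat")
#         undo_btn = gr.Button("Undo")
#         clear_btn.click(clear_chat, inputs=None, outputs=chatbot)
#         undo_btn.click(undo_last_message, inputs=chatbot, outputs=chatbot)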