# NOTE: removed non-code residue from page scrape ("Spaces:" / "Build error" banner lines)
# Standard library
import json
import os
from typing import List, Dict, Any

# Third-party
import pandas as pd
import nltk
from datasets import load_dataset
import nlpaug.augmenter.word as naw
from deep_translator import GoogleTranslator  # Updated import

# Configure NLTK data path and download required resources.
# Keeping NLTK data next to the project makes downloads persist across
# runs on hosts whose default NLTK paths are ephemeral or read-only.
NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), "../nltk_data")
os.makedirs(NLTK_DATA_PATH, exist_ok=True)
nltk.data.path.append(NLTK_DATA_PATH)
def ensure_nltk_resources() -> bool:
    """Download the NLTK resources required for augmentation.

    Returns:
        True if every resource is available, False otherwise.

    Note:
        ``nltk.download`` reports most failures through its boolean
        return value rather than by raising, so the return values are
        checked explicitly in addition to catching exceptions. The
        original code could report success after a failed download.
    """
    try:
        ok = all(
            nltk.download(resource, download_dir=NLTK_DATA_PATH)
            for resource in ("averaged_perceptron_tagger", "punkt")
        )
        if ok:
            print(f"NLTK resources downloaded to {NLTK_DATA_PATH}")
            return True
        print("Failed to download one or more NLTK resources")
        return False
    except Exception as e:
        print(f"Failed to download NLTK resources: {e}")
        return False
def load_huggingface_faq_data(dataset_name: str = "NebulaByte/E-Commerce_FAQs") -> List[Dict[str, Any]]:
    """Load FAQ data from a Hugging Face dataset, caching it locally.

    A previously cached copy at ``data/ecommerce_faqs.json`` is preferred.
    On any download/parse failure this falls back to the local CSV via
    :func:`load_faq_data`.

    Args:
        dataset_name: Hugging Face dataset identifier.

    Returns:
        A list of FAQ dictionaries with ``question``/``answer`` keys plus
        optional metadata fields.
    """
    local_path = "data/ecommerce_faqs.json"
    if os.path.exists(local_path):
        print(f"Loading cached dataset from {local_path}")
        with open(local_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    print(f"Loading dataset {dataset_name} from Hugging Face...")
    try:
        dataset = load_dataset(dataset_name)
        faqs = [{
            "question": item["question"],
            "answer": item["answer"],
            "category": item.get("category", ""),
            "question_id": item.get("question_id", ""),
            "faq_url": item.get("faq_url", "")
        } for item in dataset["train"]]
        # Ensure the cache directory exists; without this a successful
        # download was discarded (open() raised) and we fell back needlessly.
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        with open(local_path, 'w', encoding='utf-8') as f:
            json.dump(faqs, f)
        print(f"Saved dataset to {local_path}, loaded {len(faqs)} FAQs")
        return faqs
    except Exception as e:
        print(f"Error loading dataset: {e}")
        print("Falling back to local data...")
        return load_faq_data("data/faq_data.csv")
def load_faq_data(file_path: str) -> List[Dict[str, Any]]:
    """Load FAQ data from a local CSV or JSON file.

    Args:
        file_path: Path ending in ``.csv`` or ``.json``.

    Returns:
        A list of FAQ dictionaries. If the file cannot be read (missing,
        malformed, or unsupported extension), a tiny built-in sample set
        is returned so downstream code always has data to work with.
    """
    print(f"Loading data from {file_path}")
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
            faqs = df.to_dict('records')
        elif file_path.endswith('.json'):
            # Explicit UTF-8 avoids platform-dependent default encodings.
            with open(file_path, 'r', encoding='utf-8') as f:
                faqs = json.load(f)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")
        print(f"Loaded {len(faqs)} FAQ entries")
        return faqs
    except Exception as e:
        print(f"Error loading data: {e}")
        print("Creating sample dataset as fallback")
        sample_faqs = [
            {"question": "How do I track my order?", "answer": "You can track your order by logging into your account and visiting the Order History section."},
            {"question": "How do I reset my password?", "answer": "To reset your password, click on the 'Forgot Password' link on the login page."}
        ]
        return sample_faqs
def preprocess_faq(faqs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Clean and filter FAQ entries.

    Normalizes the ``question`` and ``answer`` fields to stripped strings
    and drops entries where either field is missing or blank. Input
    dictionaries are not mutated; cleaned copies are returned.

    Args:
        faqs: Raw FAQ dictionaries (e.g. from CSV or JSON loading).

    Returns:
        A new list containing only valid, cleaned FAQ entries.
    """
    def _clean(value: Any) -> str:
        # Treat None and float NaN as missing. pandas represents empty
        # CSV cells as NaN, and str(nan) would yield the literal string
        # "nan", which previously slipped past the validity check.
        if value is None:
            return ""
        if isinstance(value, float) and value != value:  # NaN != NaN
            return ""
        return str(value).strip()

    processed_faqs = []
    for faq in faqs:
        question = _clean(faq.get('question'))
        answer = _clean(faq.get('answer'))
        if question and answer:
            cleaned = dict(faq)  # copy so the caller's data is untouched
            cleaned['question'] = question
            cleaned['answer'] = answer
            processed_faqs.append(cleaned)
        else:
            print(f"Skipping invalid FAQ: question='{question}', answer='{answer}'")
    print(f"After preprocessing: {len(processed_faqs)} valid FAQ entries")
    return processed_faqs
def augment_faqs(faqs: List[Dict[str, Any]], max_faqs: int = 1000, enable_augmentation: bool = True) -> List[Dict[str, Any]]:
    """Augment FAQs with synonym-paraphrased questions.

    Each original FAQ is kept; a paraphrased copy is appended after it
    until ``max_faqs`` entries exist. Augmentation is skipped entirely if
    disabled or if the required NLTK resources are unavailable.

    Args:
        faqs: Cleaned FAQ dictionaries.
        max_faqs: Upper bound on the size of the returned list.
        enable_augmentation: Master switch for augmentation.

    Returns:
        The (possibly) augmented FAQ list.
    """
    if not enable_augmentation:
        print("Augmentation disabled; returning original FAQs")
        return faqs
    if not ensure_nltk_resources():
        print("NLTK resources unavailable; skipping augmentation")
        return faqs
    aug = naw.SynonymAug()
    augmented = []
    for faq in faqs:
        augmented.append(faq)
        if len(augmented) < max_faqs:
            try:
                result = aug.augment(faq['question'])
                # nlpaug >= 1.1.11 returns a list of strings; older
                # versions return a bare string, where indexing [0]
                # would have produced just the first character.
                aug_question = result[0] if isinstance(result, list) else result
                augmented.append({"question": aug_question, "answer": faq['answer'], "category": faq.get("category", "")})
            except Exception as e:
                print(f"Augmentation error for question '{faq['question'][:50]}...': {e}")
    print(f"Augmented to {len(augmented)} FAQs")
    return augmented
def translate_faq(faq: Dict[str, Any], target_lang: str = "es") -> Dict[str, Any]:
    """Return a copy of *faq* translated into *target_lang* via deep-translator.

    Best-effort: on any translation failure the original FAQ is returned
    unchanged, so callers always receive a usable entry.

    Args:
        faq: FAQ dictionary with English ``question`` and ``answer`` fields.
        target_lang: ISO language code to translate into (default Spanish).

    Returns:
        A translated copy tagged with a ``language`` key, or the original
        FAQ if translation fails.
    """
    try:
        engine = GoogleTranslator(source='en', target=target_lang)
        result = dict(faq)
        for field in ("question", "answer"):
            result[field] = engine.translate(faq[field])
        result["language"] = target_lang
        return result
    except Exception as e:
        print(f"Translation error: {e}")
        return faq
| # import pandas as pd | |
| # import json | |
| # import os | |
| # import nltk | |
| # from typing import List, Dict, Any | |
| # from datasets import load_dataset | |
| # import nlpaug.augmenter.word as naw | |
| # from googletrans import Translator | |
| # # Configure NLTK data path and download required resources | |
| # NLTK_DATA_PATH = os.path.join(os.path.dirname(__file__), "../nltk_data") | |
| # os.makedirs(NLTK_DATA_PATH, exist_ok=True) | |
| # nltk.data.path.append(NLTK_DATA_PATH) | |
| # def ensure_nltk_resources(): | |
| # """ | |
| # Ensure NLTK resources are downloaded and available | |
| # """ | |
| # try: | |
| # nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_PATH) | |
| # nltk.download('punkt', download_dir=NLTK_DATA_PATH) | |
| # print(f"NLTK resources downloaded to {NLTK_DATA_PATH}") | |
| # return True | |
| # except Exception as e: | |
| # print(f"Failed to download NLTK resources: {e}") | |
| # return False | |
| # def load_huggingface_faq_data(dataset_name: str = "NebulaByte/E-Commerce_FAQs") -> List[Dict[str, Any]]: | |
| # """ | |
| # Load FAQ data from Hugging Face datasets, cache locally | |
| # """ | |
| # local_path = "data/ecommerce_faqs.json" | |
| # if os.path.exists(local_path): | |
| # print(f"Loading cached dataset from {local_path}") | |
| # with open(local_path, 'r') as f: | |
| # return json.load(f) | |
| # print(f"Loading dataset {dataset_name} from Hugging Face...") | |
| # try: | |
| # dataset = load_dataset(dataset_name) | |
| # faqs = [{ | |
| # "question": item["question"], | |
| # "answer": item["answer"], | |
| # "category": item.get("category", ""), | |
| # "question_id": item.get("question_id", ""), | |
| # "faq_url": item.get("faq_url", "") | |
| # } for item in dataset["train"]] | |
| # with open(local_path, 'w') as f: | |
| # json.dump(faqs, f) | |
| # print(f"Saved dataset to {local_path}, loaded {len(faqs)} FAQs") | |
| # return faqs | |
| # except Exception as e: | |
| # print(f"Error loading dataset: {e}") | |
| # print("Falling back to local data...") | |
| # return load_faq_data("data/faq_data.csv") | |
| # def load_faq_data(file_path: str) -> List[Dict[str, Any]]: | |
| # """ | |
| # Load FAQ data from a local CSV or JSON file | |
| # """ | |
| # print(f"Loading data from {file_path}") | |
| # try: | |
| # if file_path.endswith('.csv'): | |
| # df = pd.read_csv(file_path) | |
| # faqs = df.to_dict('records') | |
| # elif file_path.endswith('.json'): | |
| # with open(file_path, 'r') as f: | |
| # faqs = json.load(f) | |
| # else: | |
| # raise ValueError(f"Unsupported file format: {file_path}") | |
| # print(f"Loaded {len(faqs)} FAQ entries") | |
| # return faqs | |
| # except Exception as e: | |
| # print(f"Error loading data: {e}") | |
| # print("Creating sample dataset as fallback") | |
| # sample_faqs = [ | |
| # {"question": "How do I track my order?", "answer": "You can track your order by logging into your account and visiting the Order History section."}, | |
| # {"question": "How do I reset my password?", "answer": "To reset your password, click on the 'Forgot Password' link on the login page."} | |
| # ] | |
| # return sample_faqs | |
| # def preprocess_faq(faqs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: | |
| # """ | |
| # Preprocess FAQ data: clean text, handle formatting, and filter invalid entries | |
| # """ | |
| # processed_faqs = [] | |
| # for faq in faqs: | |
| # # Safely handle question and answer fields | |
| # question = faq.get('question') | |
| # answer = faq.get('answer') | |
| # # Convert to string and strip, handling None values | |
| # question = str(question).strip() if question is not None else "" | |
| # answer = str(answer).strip() if answer is not None else "" | |
| # # Update FAQ dictionary | |
| # faq['question'] = question | |
| # faq['answer'] = answer | |
| # # Only include FAQs with both question and answer | |
| # if question and answer: | |
| # processed_faqs.append(faq) | |
| # else: | |
| # print(f"Skipping invalid FAQ: question='{question}', answer='{answer}'") | |
| # print(f"After preprocessing: {len(processed_faqs)} valid FAQ entries") | |
| # return processed_faqs | |
| # def augment_faqs(faqs: List[Dict[str, Any]], max_faqs: int = 1000, enable_augmentation: bool = True) -> List[Dict[str, Any]]: | |
| # """ | |
| # Augment FAQs with paraphrased questions if enabled | |
| # """ | |
| # if not enable_augmentation: | |
| # print("Augmentation disabled; returning original FAQs") | |
| # return faqs | |
| # if not ensure_nltk_resources(): | |
| # print("NLTK resources unavailable; skipping augmentation") | |
| # return faqs | |
| # aug = naw.SynonymAug() | |
| # augmented = [] | |
| # for faq in faqs: | |
| # augmented.append(faq) | |
| # if len(augmented) < max_faqs: | |
| # try: | |
| # aug_question = aug.augment(faq['question'])[0] | |
| # augmented.append({"question": aug_question, "answer": faq['answer'], "category": faq.get("category", "")}) | |
| # except Exception as e: | |
| # print(f"Augmentation error for question '{faq['question'][:50]}...': {e}") | |
| # print(f"Augmented to {len(augmented)} FAQs") | |
| # return augmented | |
| # def translate_faq(faq: Dict[str, Any], target_lang: str = "es") -> Dict[str, Any]: | |
| # """ | |
| # Translate FAQ to a target language | |
| # """ | |
| # try: | |
| # translator = Translator() | |
| # translated = faq.copy() | |
| # translated["question"] = translator.translate(faq["question"], dest=target_lang).text | |
| # translated["answer"] = translator.translate(faq["answer"], dest=target_lang).text | |
| # translated["language"] = target_lang | |
| # return translated | |
| # except Exception as e: | |
| # print(f"Translation error: {e}") | |
| # return faq |