File size: 3,369 Bytes
26d1a81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import pandas as pd
import json
from typing import List, Dict, Any
from datasets import load_dataset

def load_huggingface_faq_data(dataset_name: str = "NebulaByte/E-Commerce_FAQs") -> List[Dict[str, Any]]:
    """
    Fetch FAQ records from a Hugging Face dataset.

    Reads the "train" split of ``dataset_name`` and builds one dict per row
    with the keys ``question``, ``answer``, ``category``, ``question_id`` and
    ``faq_url``. The last three default to an empty string when the row does
    not provide them.

    If anything goes wrong (download failure, missing split, missing
    ``question``/``answer`` field, ...), the error is printed and the result
    of ``load_faq_data("data/faq_data.csv")`` is returned instead.
    """
    print(f"Loading dataset {dataset_name} from Hugging Face...")

    # Metadata fields that are copied when present and blank otherwise.
    optional_fields = ("category", "question_id", "faq_url")

    try:
        split = load_dataset(dataset_name)["train"]

        # question/answer are mandatory (a missing one raises and triggers the
        # fallback below); the remaining fields are best-effort.
        records = [
            {
                "question": row["question"],
                "answer": row["answer"],
                **{field: row.get(field, "") for field in optional_fields},
            }
            for row in split
        ]

        print(f"Loaded {len(records)} FAQ entries from Hugging Face")
        return records

    except Exception as err:
        print(f"Error loading dataset from Hugging Face: {err}")
        print("Falling back to local data...")
        return load_faq_data("data/faq_data.csv")

def load_faq_data(file_path: str) -> List[Dict[str, Any]]:
    """
    Load FAQ data from a local CSV or JSON file.

    Args:
        file_path: Path to a file whose name ends in ``.csv`` or ``.json``.

    Returns:
        For CSV input, one dict per row keyed by column name; cells that are
        empty in the file are returned as ``None`` (not float NaN) so that
        downstream text handling can treat them as "missing".
        For JSON input, whatever the file decodes to (a list of FAQ dicts is
        expected).
        If loading fails for any reason (missing file, parse error,
        unsupported extension), the error is printed and a small built-in
        sample dataset is returned instead; such failures are not raised.
    """
    print(f"Loading data from {file_path}")
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
            # Assume CSV has 'question' and 'answer' columns.
            # pandas represents empty cells as NaN, which is a float and would
            # break string operations later on; normalise them to None.
            faqs = [
                {column: (None if pd.isna(value) else value)
                 for column, value in row.items()}
                for row in df.to_dict('records')
            ]
        elif file_path.endswith('.json'):
            # JSON text is UTF-8; do not depend on the platform default encoding.
            with open(file_path, 'r', encoding='utf-8') as f:
                faqs = json.load(f)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        print(f"Loaded {len(faqs)} FAQ entries")
        return faqs
    except Exception as e:
        # Deliberate best-effort: callers always get something usable back.
        print(f"Error loading data: {e}")
        # Create a minimal sample dataset as fallback
        print("Creating sample dataset as fallback")
        sample_faqs = [
            {"question": "How do I track my order?",
             "answer": "You can track your order by logging into your account and visiting the Order History section."},
            {"question": "How do I reset my password?",
             "answer": "To reset your password, click on the 'Forgot Password' link on the login page."}
        ]
        return sample_faqs

def _clean_text(value: Any) -> str:
    """Return *value* as stripped text; None and NaN become an empty string."""
    if value is None:
        return ""
    # NaN is the only float that is not equal to itself; this is how pandas
    # marks empty CSV cells, and it must count as "missing", not as "nan".
    if isinstance(value, float) and value != value:
        return ""
    if not isinstance(value, str):
        # e.g. a numeric answer parsed from CSV as int/float
        value = str(value)
    return value.strip()


def preprocess_faq(faqs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Preprocess FAQ data: clean text, handle formatting.

    For every entry the ``question`` and ``answer`` values are normalised to
    stripped strings (missing, ``None`` or NaN values become ``""``; other
    non-string values are converted with ``str``). Entries whose question or
    answer is empty after cleaning are dropped.

    Args:
        faqs: FAQ dicts, typically with ``question`` and ``answer`` keys.

    Returns:
        A new list containing only the entries that have both a non-empty
        question and a non-empty answer. The dicts themselves are updated in
        place, so the returned items are the same objects as the inputs.
    """
    processed_faqs = []
    for faq in faqs:
        # Basic cleaning - remove extra whitespace
        faq['question'] = _clean_text(faq.get('question'))
        faq['answer'] = _clean_text(faq.get('answer'))

        # Only include FAQs with both question and answer
        if faq['question'] and faq['answer']:
            processed_faqs.append(faq)

    print(f"After preprocessing: {len(processed_faqs)} valid FAQ entries")
    return processed_faqs