File size: 19,429 Bytes
60f48d0
28af803
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60f48d0
28af803
 
 
 
60f48d0
28af803
 
 
 
 
60f48d0
28af803
 
60f48d0
28af803
 
 
 
 
60f48d0
28af803
 
 
 
 
 
 
 
 
 
 
60f48d0
28af803
 
60f48d0
 
28af803
1434c06
28af803
 
 
 
 
 
 
 
 
 
1434c06
60f48d0
28af803
 
 
1434c06
60f48d0
28af803
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1434c06
28af803
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1434c06
28af803
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7e10ca
28af803
e7e10ca
 
 
 
 
 
 
 
 
 
28af803
e7e10ca
 
28af803
 
e7e10ca
 
28af803
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7e10ca
28af803
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1434c06
 
28af803
 
1434c06
28af803
1434c06
28af803
1434c06
28af803
1434c06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28af803
 
 
 
 
 
 
 
 
1434c06
28af803
 
 
1434c06
 
28af803
1434c06
 
 
 
 
 
 
28af803
1434c06
28af803
 
 
 
 
 
 
 
 
 
60f48d0
 
 
28af803
 
 
 
60f48d0
28af803
 
60f48d0
 
 
 
 
 
 
 
28af803
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
import gradio as gr
import pandas as pd
import numpy as np
import re
from transformers import pipeline, AutoTokenizer
from sentence_transformers import SentenceTransformer
import torch
import os
import spaces

# --- Global/Cached Variables ---
# Everything in this try block runs once at import time: it downloads the
# service directory, embeds the descriptions, and builds the Llama pipeline.
# The outer handler at the bottom only prints the exception, so any name that
# would have been assigned after the failing statement stays undefined.
try:
    # --- Load Data and Embeddings ---
    # Google Sheet id plus the gid of every worksheet to pull. The dict key is
    # the label later written into the 'Source_Sheet' column.
    sheet_id = "1hMsYgDQj3ymqwxUXA7R-ITITnw3HzeVZBxaXAjiJwAE"
    sheet_gids = {
        "Starting Point": "0",
        "Immediate Help": "1278392561",
        "Counselling": "713986636",
        "Child/Youth Counselling": "1265113400",
        "Parenting": "299805447",
        "Safe Housing": "1571281149",
        "Victim Rights Info": "1952909822",
        "Legal Rep": "958128700",
        "Legal Info": "1989315755",
        "Grief": "2127423570"
    }

    all_dfs = []
    for sheet_name, gid in sheet_gids.items():
        # CSV export URL for one worksheet of the sheet.
        url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&gid={gid}"
        try:
            df = pd.read_csv(url)
            df['Source_Sheet'] = sheet_name
            all_dfs.append(df)
        except Exception as e:
            # Best effort: a worksheet that fails to load is reported and skipped.
            print(f"Error reading {sheet_name}: {e}")

    if all_dfs:
        combined_df = pd.concat(all_dfs, ignore_index=True)
        # The text that gets embedded: incident type and description joined by '; '.
        # NOTE(review): astype(str) presumably turns missing cells into the text
        # 'nan', which would make the later fillna('') a no-op — confirm.
        combined_df['Combined Description'] = combined_df['Relevant crime/incident'].astype(str) + '; ' + combined_df['Description'].astype(str)
        print(f"DF COMBINED! {combined_df}")
    else:
        # No worksheet could be read; downstream code checks combined_df.empty.
        combined_df = pd.DataFrame()
        print("WARNING: Dataframe is empty.")

    # --- Load Embedding Model ---
    print("Loading Embedding Model...")
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Embedding Model LOADED!")

    if not combined_df.empty:
        # One embedding per row, stored as a per-row array in
        # 'embeddings_description' for the similarity search.
        text_to_embed_description = combined_df['Combined Description'].fillna('').astype(str).tolist()
        embeddings_description = embedding_model.encode(text_to_embed_description)
        combined_df['embeddings_description'] = list(embeddings_description)
        print(f"DF UPDATED! {combined_df}")

    else:
        print("WARNING: Skipping embedding generation due to empty DataFrame.")

    # Hugging Face access token read from the HF_TOKEN environment variable
    # (None when the variable is not set).
    HF_AUTH_TOKEN = os.environ.get("HF_TOKEN")

    print("Loading Llama Model...")
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    print(f"llama model: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=HF_AUTH_TOKEN)

    # Text-generation pipeline used by llm_generate_response; float16 weights
    # are requested and device placement is left to device_map="auto".
    llm = pipeline(
        "text-generation",
        model=model_name,
        tokenizer=tokenizer,
        device_map="auto",
        torch_dtype=torch.float16,
    )

    print("LLAMA loaded!!")

except Exception as e:
    # The failure is logged but not re-raised, so the module keeps importing.
    print(f"FATAL ERROR during model or data loading: {e}")


# --- Constants ---
# Minimum best-match similarity needed before chatbot_loop answers at all;
# below it the bot asks the user to describe their need in more detail.
DESC_THRESHOLD = 0.2
# Per-row similarity cut-off applied in generate_resources to keep only the
# most relevant services.
FINAL_THRESHOLD = 0.4
# Number of top description matches retrieved per query.
N_DESC = 20

# --- Global Chat Context State ---
# Updated inside chatbot_loop and kept between calls. These are module-level
# names, so every call to chatbot_loop in this process reads and writes the
# same values.
LAST_KNOWN_INTENT = None  # free-text description of the help the user wants
LAST_KNOWN_CITY = None  # city information collected so far
desc_results_df = None  # most recent stage-1 retrieval result

# Base persona prompt for the LLM.
SYSTEM_PROMPT = """
YOU ARE A TRAUMA-INFORMED, COMMUNITY-CONNECTED SUPPORT AGENT DESIGNED TO ASSIST INDIVIDUALS EXPERIENCING GENDER-BASED VIOLENCE IN BRITISH COLUMBIA, CANADA. 
"""

# Persona prompt extended with instructions for summarising retrieved service
# records in at most 50 words. Not referenced by any function visible in this
# part of the file.
# NOTE(review): "FOLLWING" below is a typo for "FOLLOWING"; it is left as-is
# here because the string is runtime prompt content.
SYSTEM_PROMPT_RAG = SYSTEM_PROMPT + """
\n
**RAG INSTRUCTIONS**: NEVER START YOUR RESPONSE WITH A GREETING. YOUR ONLY TASK IS TO PROVIDE A SUMMARY OF THE FOLLWING SERVICE INFORMATION, delimited by triple backticks (```), to formulate your response WITHIN 50 WORDS. MAINTAIN WARM ATTITUDE, BUT SUMMARY SHOULD BE IN 50 WORDS. **Do not mention the RAG process or the triple backticks in your final answer**.
"""

# Lower-case phrases a user may type, mapped to the canonical city category.
# Every value must be one of VALID_CITY_CATEGORIES. "delta" and "white rock"
# are included because the bot's own prompt offers them as choices; without a
# keyword those answers were never recognised as cities.
CITY_KEYWORDS = {
    "new west": "New Westminster",
    "new westminster": "New Westminster",
    "surrey": "Surrey",
    "vancouver": "Vancouver",
    "downtown vancouver": "Vancouver",
    "richmond": "Richmond",
    "north van": "North Vancouver",
    "north vancouver": "North Vancouver",
    "burnaby": "Burnaby",
    "west van": "West Vancouver",
    "west vancouver": "West Vancouver",
    "langley": "Langley",
    "delta": "Delta",
    "white rock": "White Rock",
    "coquitlam": "Tri-Cities (Port Moody, Coquitlam, Port Coquitlam)",
    "port moody": "Tri-Cities (Port Moody, Coquitlam, Port Coquitlam)",
    "port coquitlam": "Tri-Cities (Port Moody, Coquitlam, Port Coquitlam)",
}

# Canonical city categories used to label services.
VALID_CITY_CATEGORIES = [
    "New Westminster",
    "Surrey",
    "Vancouver",
    "Richmond",
    "North Vancouver",
    "Burnaby",
    "West Vancouver",
    "Langley",
    "Delta",
    "White Rock",
    "Tri-Cities (Port Moody, Coquitlam, Port Coquitlam)",
    "Other cities in BC, Canada",
]


# --- Core RAG Functions ---
def retrieve_with_pandas_description(query, top_k=N_DESC):
    """Rank service rows by cosine similarity between `query` and each stored embedding.

    The scores are written into the module-level ``combined_df`` as the
    ``similarity_desc`` column. Returns a copy of the ``top_k`` highest-scoring
    rows, or an empty DataFrame when ``combined_df`` holds no data.
    """
    print(f"I'm at retrieve_with_pandas_desc with {query}")

    if combined_df.empty:
        return pd.DataFrame()

    query_vec = embedding_model.encode([query])[0]

    def _cosine(row_vec):
        # Cosine similarity of the query vector against one row's embedding.
        return np.dot(query_vec, row_vec) / (np.linalg.norm(query_vec) * np.linalg.norm(row_vec))

    combined_df['similarity_desc'] = combined_df['embeddings_description'].apply(_cosine)
    ranked = combined_df.sort_values(by="similarity_desc", ascending=False)
    return ranked.head(top_k).copy()

def is_query_only_cities(query):
    """Return True when `query` names one or more known cities and nothing else.

    Commas and runs of whitespace are treated as separators, matching is
    case-insensitive, and the connector words "and", "or" and "in" are ignored.
    City phrases are matched as whole words, longest phrase first, so
    "north vancouver" is consumed as one city and a keyword embedded in a
    longer word does not count. At least one city must be present: blank input
    and connector-only input such as "in" return False.
    """
    normalized_query = re.sub(r'[,\s]+', ' ', query.lower()).strip()
    if not normalized_query:
        return False

    # Fast path: the whole query is exactly one known keyword.
    if normalized_query in CITY_KEYWORDS:
        return True

    # Longest alternatives first so multi-word names win over their suffixes.
    alternatives = '|'.join(
        re.escape(keyword)
        for keyword in sorted(CITY_KEYWORDS, key=len, reverse=True)
    )
    city_pattern = re.compile(rf'(?<!\w)(?:{alternatives})(?!\w)')
    remainder, city_hits = city_pattern.subn(' ', normalized_query)
    if not city_hits:
        # Nothing but connectors or other words: not a city-only message.
        return False

    # Whatever is left may only consist of connector words.
    remainder = re.sub(r'\b(?:and|or|in)\b', ' ', remainder)
    return not remainder.strip()

def remove_substrings_from_string(main_string, substrings_list):
    """Strip city mentions from `main_string` so only the intent text remains.

    Args:
        main_string: The user's text.
        substrings_list: City names to remove. Entries are compared
            case-insensitively against both the canonical names and the
            keywords in CITY_KEYWORDS. A single string is accepted and treated
            as a one-element list.

    Returns:
        `main_string` with every keyword of the given cities removed, together
        with an optional preceding "in", "at" or "for". Commas and repeated
        whitespace are collapsed and the result is stripped.
    """
    cleaned_string = main_string
    print(f"[DEBUG] main_string '{cleaned_string}'")

    # A bare string would otherwise be split into single characters by set().
    if isinstance(substrings_list, str):
        substrings_list = [substrings_list]
    wanted = {str(city).casefold() for city in (substrings_list or ())}

    # Longest keyword first across ALL requested cities, so "north vancouver"
    # is removed before the shorter "vancouver" can cut it in half.
    keywords = sorted(
        (
            key for key, city in CITY_KEYWORDS.items()
            if city.casefold() in wanted or key.casefold() in wanted
        ),
        key=len,
        reverse=True,
    )

    # Optional preposition. The leading \b keeps it from matching the tail of
    # a longer word (e.g. the "in" inside "within").
    prepositions = r'(?:\b(?:in|at|for)\s+)?'

    for keyword in keywords:
        pattern = rf'{prepositions}\b{re.escape(keyword)}\b\s*'
        cleaned_string = re.sub(pattern, ' ', cleaned_string, flags=re.IGNORECASE)

    # Collapse the separators left behind by the removals.
    cleaned_string = re.sub(r'[\s,]+', ' ', cleaned_string).strip()
    print(f"[DEBUG] cleaned_string '{cleaned_string}'")
    return cleaned_string

def get_df_filtered_by_desc(query):
    """Stage-1 retrieval: return the N_DESC rows whose descriptions best match `query`."""
    print(f"[DEBUG] get_df_filtered_by_desc with query: '{query}'")

    top_matches = retrieve_with_pandas_description(query, top_k=N_DESC)
    return top_matches

def detect_city_from_query(query):
    """Return the canonical cities mentioned in `query`, without duplicates.

    Keywords from CITY_KEYWORDS are matched case-insensitively as whole words.
    Longer keywords are tried first and consume their text, so
    "north vancouver" yields only "North Vancouver" (not also "Vancouver") and
    "downtown vancouver" yields "Vancouver" once. Cities are listed in order
    of first appearance in the query; the list is empty when none is found.
    """
    print(f"detect_city_from_query with {query}")
    text = query.lower()

    canonical_by_keyword = {key.lower(): city for key, city in CITY_KEYWORDS.items()}
    alternatives = '|'.join(
        re.escape(keyword)
        for keyword in sorted(canonical_by_keyword, key=len, reverse=True)
    )

    detected = []
    if alternatives:
        for match in re.finditer(rf"\b(?:{alternatives})\b", text):
            canonical = canonical_by_keyword[match.group(0)]
            if canonical not in detected:
                detected.append(canonical)

    print(f"found city: {detected}")
    return detected

def get_df_filtered_by_general_city(city_context, desc_results_df):
    """Keep the rows whose "City" cell mentions any city in `city_context`.

    Args:
        city_context: City names to look for. A single string is treated as a
            one-element list.
        desc_results_df: DataFrame with a string "City" column.

    Returns:
        A ``(cities, filtered_df)`` tuple. ``cities`` is `city_context` (wrapped
        in a list if it was a string). When no city is given the frame is
        returned unfiltered.
    """
    print(f"[STEP] I'm at get_df_filtered_by_general_city with query: {city_context}.")
    print(f"[DEBUG] As of now desc_df is = {desc_results_df}")

    # Iterating a bare string would turn every character into an alternative.
    cities = [city_context] if isinstance(city_context, str) else city_context

    if not cities:
        general_city_filtered_df = desc_results_df
    else:
        # Lookarounds instead of \b: a name ending in ")" (the Tri-Cities
        # category) has no word boundary after it, so \b could never match.
        pattern = r'(?<!\w)(?:' + '|'.join(re.escape(c) for c in cities) + r')(?!\w)'
        general_city_filtered_df = desc_results_df[
            desc_results_df["City"].str.contains(pattern, case=False, na=False, regex=True)
        ]
    print(f"[DEBUG] FILTERED by GENERAL city! General_city_filtered_df is = {general_city_filtered_df}")

    return cities, general_city_filtered_df

def get_df_filtered_by_service_city(city_context, general_city_filtered_df):
    """Narrow the rows to those whose "Main Service City" mentions a requested city.

    Args:
        city_context: City names to look for. A single string is treated as a
            one-element list.
        general_city_filtered_df: DataFrame with a string "Main Service City"
            column.

    Returns:
        The narrowed DataFrame. If no city is given, or no row matches,
        `general_city_filtered_df` is returned unchanged.
    """
    print(f"[STEP] I'm at get_df_filtered_by_SERVICE_city with: '{city_context}'")

    # Iterating a bare string would turn every character into an alternative.
    cities = [city_context] if isinstance(city_context, str) else city_context
    if not cities:
        return general_city_filtered_df

    # Lookarounds instead of \b: a name ending in ")" (the Tri-Cities category)
    # has no word boundary after it, so \b could never match.
    pattern = r'(?<!\w)(?:' + '|'.join(re.escape(c) for c in cities) + r')(?!\w)'
    main_city_filtered_df = general_city_filtered_df[
        general_city_filtered_df["Main Service City"].str.contains(pattern, case=False, na=False, regex=True)
    ]
    print(f"[DEBUG] FILTERED by MAIN city! Now, main_city_filtered_df is = {main_city_filtered_df} THIS WILL FEED THE FINAL RESULTS!!")

    return main_city_filtered_df if not main_city_filtered_df.empty else general_city_filtered_df

@spaces.GPU(duration=250)
def llm_generate_response(prompt):
    """Run `prompt` through the Llama pipeline and return only the model's reply.

    The prompt is wrapped in the ``[INST] ... [/INST]`` chat markers, and the
    text after the last ``[/INST]`` in the generated output is returned.
    Any exception during generation is printed and replaced by a fixed apology.
    """
    prompt_template = f"<s>[INST] {prompt} [/INST]"
    try:
        outputs = llm(
            prompt_template,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
        )
        full_text = outputs[0]['generated_text']
        # The pipeline output contains the prompt too; keep only what follows it.
        _, _, answer = full_text.rpartition("[/INST]")
        return answer.strip()
    except Exception as e:
        print(f"LLM generation error: {e}")
        return "I encountered an error generating a response. Please try again."
    
    
def generate_resources(FINAL_FILTERED_DF):
    """Format the sufficiently relevant services as a plain-text listing.

    Args:
        FINAL_FILTERED_DF: DataFrame with the columns "similarity_desc",
            "Title", "Description", "Phone #", "Email" and "Website".

    Returns:
        One entry per row whose "similarity_desc" is at least FINAL_THRESHOLD,
        separated by blank lines. Missing values are shown as "N/A". If no row
        reaches the threshold, a short fallback sentence is returned instead.
    """
    # Boolean indexing builds a new frame; the caller's frame is not modified.
    relevant_df = FINAL_FILTERED_DF[
        FINAL_FILTERED_DF["similarity_desc"] >= FINAL_THRESHOLD
    ]

    if relevant_df.empty:
        # The caller appends the VictimLinkBC resource after this sentence.
        return (
            "Unfortunately, no highly relevant services were found. "
            "However, please check out the below resource."
        )

    print(f"[STEP] I'm at generate_resources with df: {relevant_df}")
    print("[STEP] I'm at generate_resources with df. Creating context_list!")

    def _display(value):
        # Missing cells (NaN/None) would otherwise be printed as "nan".
        return 'N/A' if pd.isna(value) else value

    context_list = []
    for _, row in relevant_df.iterrows():
        context_entry = (
            f"Organization Name: {_display(row['Title'])}\n"
            f"{_display(row['Description'])}\n"
            f"📞: {_display(row['Phone #'])}, 📧: {_display(row['Email'])}, 🌐: {_display(row['Website'])}\n"
            "------------------------------------------------------------"
        )
        context_list.append(context_entry)
        print(f"[STEP] I'm at generate_resources with CONTEXT_LIST []: {context_entry}")

    return "\n\n".join(context_list)

    
def chatbot_loop(query, history):
    """Advance the city + intent dialogue by one user turn and return the reply.

    The bot needs two pieces of information: a city and a description of the
    help wanted (the intent). Whichever arrives first is remembered in the
    module-level LAST_KNOWN_CITY / LAST_KNOWN_INTENT and the user is asked for
    the other. Once both are known, services are retrieved by description
    similarity, narrowed by city, and formatted; both slots are then cleared.

    Args:
        query: The user's message.
        history: Previous chat turns; an empty/falsy value marks the first turn
            and adds a greeting to the reply.

    Returns:
        The reply text. Every path returns a string.

    NOTE: the conversation state lives in module-level globals, so it is
    shared by every caller in this process.
    """
    global LAST_KNOWN_INTENT, LAST_KNOWN_CITY
    global desc_results_df

    greeting = (
        "Hello, I am happy that you have found me. My name is One Tap Away, designed for gender-based violence support services resources."
        "\n Please note that my answer is only restricted to Metro Vancouver, BC!"
    )
    ask_intent = "Thank you for letting me know the city. Which areas do you need help with? For example: counselling, safe housing, or legal information?"
    ask_city = "Which city are you looking for services in? Vancouver, Surrey, Burnaby, Richmond, Langley, Coquitlam, Port Moody, Port Coquitlam, West Vancouver, North Vancouver, White Rock, Delta, Others?"
    ask_more = "I am sorry, could you please elaborate more on what areas you'd like help with?"

    city_context = detect_city_from_query(query)
    is_first_interaction = not history

    if city_context and is_query_only_cities(query):
        # The message is nothing but city names.
        print(f"query is a city name itself ('{query}') ")

        # Always store the detected canonical list, never the raw text: the
        # city filters iterate over this value and need a list of city names.
        LAST_KNOWN_CITY = city_context

        if LAST_KNOWN_INTENT:
            # Intent was given earlier; the city completes the pair.
            print(f"[CITY ONLY] We've LAST_KNOWN_INTENT: '{LAST_KNOWN_INTENT}'. New city input: '{LAST_KNOWN_CITY}' as city_context.")
        else:
            # City came first (or again): ask what kind of help is needed.
            print(f"[CITY ONLY] INTENT's !!!NOT!!! been saved in the past. Saving the query: '{LAST_KNOWN_CITY}' as LAST_KNOWN_CITY.")
            if is_first_interaction:
                return greeting + "\n " + ask_intent
            return ask_intent

    elif not city_context:
        # No city in the message, so it is treated as an intent.
        print(f"query is an intent itself ('{query}') ")
        LAST_KNOWN_INTENT = query

        if LAST_KNOWN_CITY:
            print(f"[INTENT ONLY] City's been given in the past: '{LAST_KNOWN_CITY}', now the user's giving the intent! Saving the query: '{LAST_KNOWN_INTENT}' as INTENT.")
        else:
            print(f"[INTENT ONLY] no city_context with: {query}. Saving the query: '{LAST_KNOWN_INTENT}' as INTENT. This is potentially when the user just provided help areas without city info.")
            if is_first_interaction:
                return greeting + "\n I would appreciate more details for your inquiries. " + ask_city
            return ask_city

    else:
        # City and intent arrived in the same message.
        print("[CITY & INTENT] City AND intent detected in one input!")
        LAST_KNOWN_CITY = city_context
        print(f"LAST_KNOWN_CITY IS SET!: '{LAST_KNOWN_CITY}'")
        LAST_KNOWN_INTENT = query
        print(f"LAST_KNOWN_INTENT IS SET! '{LAST_KNOWN_INTENT}'")

    if LAST_KNOWN_CITY and LAST_KNOWN_INTENT:
        # Drop any city wording from the intent before embedding it.
        LAST_KNOWN_INTENT = remove_substrings_from_string(LAST_KNOWN_INTENT, LAST_KNOWN_CITY)
        print(f"[CITY & INTENT] FINALLY we got LAST_KNOWN_CITY: '{LAST_KNOWN_CITY}' AND LAST_KNOWN_INTENT: '{LAST_KNOWN_INTENT}' ")

        if not LAST_KNOWN_INTENT:
            # The message held a city plus filler only (e.g. "for surrey"):
            # keep the city and ask what help is needed.
            LAST_KNOWN_INTENT = None
            return ask_intent

        desc_results_df = get_df_filtered_by_desc(LAST_KNOWN_INTENT)

        # Check for an empty/low-scoring result BEFORE touching its columns:
        # an empty frame has no 'Title' or 'similarity_desc' column.
        if desc_results_df.empty or desc_results_df.get('similarity_desc', pd.Series([-1])).max() < DESC_THRESHOLD:
            if is_first_interaction:
                return "Hello, I am happy that you have found me. My name is One Tap Away, designed for gender-based violence support services resources. \n Please note that my answer is only restricted to Metro Vancouver, BC! \n\n Could you please explain to me more about the area you need help with?"
            return ask_more

        print("--- Stage 1: Description Similarity Scores ---")
        print(desc_results_df[['Title', 'similarity_desc']].head())
        print(f"Max Similarity: {desc_results_df['similarity_desc'].max():.4f}")
        print(f"Min Similarity: {desc_results_df['similarity_desc'].min():.4f}\n")

        detected_cities, general_city_df = get_df_filtered_by_general_city(LAST_KNOWN_CITY, desc_results_df)
        final_df = get_df_filtered_by_service_city(detected_cities, general_city_df)

        final_statement = "Thank you for the information. Here is some relevant information for you:\n\n"
        final_output = generate_resources(final_df)
        final_resource = "\n\n I would like to highlight that VictimLink BC could be a good start for you. \n VictimLink BC is a toll-free, confidential and multilingual services available across B.C. and the Yukon. VictimLinkBC provides information and referral services to all victims of crime and immediate crisis support to victims of family and sexual violence, victims of human trafficking and sexual services. \n https://victimlinkbc.ca/ \n 1-800-563-0808 \n 211-victimlinkbc@uwbc.ca "

        # The request has been answered; start the next one from scratch.
        LAST_KNOWN_INTENT = None
        LAST_KNOWN_CITY = None
        print(f"Wiping intent & city. Intent: '{LAST_KNOWN_INTENT}' City: '{LAST_KNOWN_CITY}'")

        return final_statement + final_output + final_resource  # used to be llm_generate_response(combined_prompt)

    # Reached only when the message was blank while a city was already known;
    # answer with a prompt instead of returning None.
    return ask_more


def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Generator callback for the chat UI: yield the full reply for `message` once.

    `system_message`, `max_tokens`, `temperature` and `top_p` are accepted to
    match the interface's additional inputs but are not used.
    """
    # A single yield of the complete string; the generator then simply ends.
    yield chatbot_loop(message, history)

# Chat UI backed by respond(). The four additional inputs are hidden
# (visible=False) and are passed to respond() after message and history, in
# this order; respond() does not use them.
chatbot = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a trauma-informed support agent for GBV in BC.", label="System message", visible=False),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens", visible=False),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature", visible=False),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p", visible=False),
    ],
    title="One Tap Away Chatbot",
    theme="soft",
)

# Page layout: a sidebar holding a login button, with the chat interface
# rendered in the main area.
with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.LoginButton()
    chatbot.render()

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(share=True)