File size: 2,894 Bytes
d9a08f5
 
 
 
 
 
 
28fff8f
d9a08f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28fff8f
d9a08f5
28fff8f
 
 
 
d9a08f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab57d69
 
 
 
d9a08f5
ab57d69
d9a08f5
 
 
 
5d7d7e8
 
a03fb8e
d9a08f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab57d69
 
d9a08f5
5d7d7e8
ab57d69
d9a08f5
28fff8f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import gradio as gr
import os
from huggingface_hub import InferenceClient
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup

# NOTE(review): env var is spelled "HF_Token" (mixed case) — confirm this
# matches the deployment's secret name; os.getenv returns None if unset.
hf_token = os.getenv("HF_Token")
# Hosted inference client for the chat model; token may be None (anonymous access).
client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=hf_token)
# Sentence-embedding model used to vectorize both corpus chunks and queries.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def preprocess_text(text):
    """Split raw text into clean, sentence-level chunks.

    The text is broken on newlines, and each line is further split on
    ". " sentence boundaries. Whitespace-only fragments are dropped.

    Args:
        text: Arbitrary raw text (e.g. extracted from HTML).

    Returns:
        list[str]: Non-empty, stripped text fragments in original order.
    """
    fragments = []
    for line in text.strip().split("\n"):
        fragments.extend(line.split(". "))
    # Keep only fragments that still contain text after stripping.
    return [stripped for frag in fragments if (stripped := frag.strip())]

def prepare_docs():
    """Load spot descriptions from spots.json and chunk them for indexing.

    Each entry's 'popup' HTML is parsed; the spot name is taken from the
    'infobox-title' element (falling back to "Unknown Spot"), and every
    text chunk is prefixed with "[name]: " so retrieval results keep
    their provenance.

    Returns:
        list[str]: All name-tagged text chunks across every spot.

    Raises:
        FileNotFoundError: If spots.json is missing.
        KeyError: If an entry lacks the 'popup' field.
    """
    # Explicit encoding so the read does not depend on the platform default.
    with open('spots.json', 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    all_processed_chunks = []

    for item in raw_data:
        soup = BeautifulSoup(item['popup'], 'html.parser')
        # Look the title element up once instead of searching the tree twice.
        title_el = soup.find(class_='infobox-title')
        name = title_el.get_text() if title_el else "Unknown Spot"
        raw_html_text = soup.get_text(separator=" ")

        for chunk in preprocess_text(raw_html_text):
            all_processed_chunks.append(f"[{name}]: {chunk}")

    return all_processed_chunks

# Build the retrieval corpus and FAISS index once, at import time.
processed_data = prepare_docs()
# Encode every chunk; result is (num_chunks, embedding_dim) — presumably
# 384 for all-MiniLM-L6-v2, but only shape[1] is relied upon here.
embeddings = embed_model.encode(processed_data)
# Exact L2 (Euclidean) nearest-neighbour index over the chunk embeddings.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings).astype('float32'))

def retrieve(query, k=3):
    """Return the k corpus chunks most similar to *query*.

    Args:
        query: Free-text user query.
        k: Number of nearest chunks to return (default 3).

    Returns:
        list[str]: Matching name-tagged chunks, nearest first. May contain
        fewer than k items if the index holds fewer than k vectors.
    """
    query_vec = embed_model.encode([query])
    distances, indices = index.search(np.array(query_vec).astype('float32'), k)
    # FAISS pads the result with -1 when fewer than k vectors exist; skip
    # those so we never index processed_data with -1 (which would silently
    # wrap around to the last chunk).
    return [processed_data[i] for i in indices[0] if i >= 0]

def respond(message, history):
    """Stream a CityScout reply grounded in retrieved database chunks.

    Args:
        message: The latest user message.
        history: Prior chat turns as OpenAI-style role/content dicts.

    Yields:
        str: The accumulated reply text after each streamed token, so the
        UI can render progressive output.
    """
    facts = "\n- ".join(retrieve(message))

    system_prompt = f"""You are 'CityScout', a friendly guide to unique hangout spots.
    Use the following verified facts from our database to help the user.
    Always mention the name of the spot found in the brackets [Like This].
    
    Database Facts:
    - {facts}
    
    If you find a match, describe it enthusiastically! If not, help them brainstorm based on their interests.""" 

    # System prompt first, then the prior turns, then the new user message.
    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(history)
    messages.append({"role": "user", "content": message})

    partial = ""
    stream = client.chat_completion(
        messages,
        max_tokens=500,
        temperature=0.7,
        top_p=0.9,
        stream=True,
    )
    for event in stream:
        piece = event.choices[0].delta.content
        if piece:
            partial += piece
            yield partial

chatbot = gr.ChatInterface(
    respond,
    # respond() appends history entries directly to an OpenAI-style message
    # list, so history must arrive as role/content dicts — hence
    # type="messages" (the legacy tuple format would silently corrupt the
    # prompt sent to the model).
    type="messages",
    title="CityScout: Unique Spot Finder",
    description="Tell me your city or interests and I'll help you find cool places nearby!"
)

if __name__ == "__main__":
    chatbot.launch()