File size: 6,831 Bytes
2c13490
 
 
 
 
 
 
9e67fa8
ba529a2
2c13490
9e67fa8
2c13490
ba529a2
 
9e67fa8
2c13490
 
 
 
4d8e90f
9e67fa8
ba529a2
4d8e90f
2c13490
 
 
9e67fa8
2c13490
 
 
 
 
ba529a2
2c13490
 
 
ba529a2
2c13490
 
ba529a2
2c13490
 
 
 
9e67fa8
2c13490
 
 
ba529a2
 
 
2c13490
 
4d8e90f
 
 
 
 
 
 
 
 
ba529a2
2c13490
 
 
4d8e90f
ba529a2
 
 
4d8e90f
 
 
 
2c13490
 
9e67fa8
2c13490
 
 
 
 
 
ba529a2
 
2c13490
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba529a2
 
2c13490
 
66518ca
9e67fa8
2c13490
9e67fa8
4d8e90f
9e67fa8
2c13490
 
 
 
ba529a2
 
2c13490
 
 
4d8e90f
 
66518ca
ba529a2
 
4d8e90f
 
2c13490
9e67fa8
ba529a2
2c13490
 
 
 
9e67fa8
2c13490
 
 
 
 
 
 
 
 
 
33f0e60
 
ba529a2
6a96817
ba529a2
6a96817
2c13490
 
 
 
 
 
cde5cd2
 
 
 
fa77263
cde5cd2
0844908
752b8e9
cde5cd2
 
4d8e90f
2c13490
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import os
import glob
import numpy as np

import gradio as gr
from sentence_transformers import SentenceTransformer

# Basic configuration: all nutrition source documents live as .txt files under
# DATA_DIR, so content can be refreshed by dropping in new files — no code change.
DATA_DIR = "data"
# Number of top-scoring chunks returned by retrieval for each question.
TOP_K = 3

# Load the sentence-embedding model once at module import time.
# all-MiniLM-L6-v2 is small enough to run on CPU while still giving usable
# semantic similarity for retrieval.
print("loading embedding model...")
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def load_corpus_and_chunks(data_dir: str, min_chunk_len: int = 100) -> list:
    """
    Load every .txt file under *data_dir* and split it into paragraph chunks.

    Files are processed in sorted order so the chunk list (and therefore the
    embedding matrix later built from it) is deterministic across runs —
    plain glob order is filesystem-dependent.

    Args:
        data_dir: Folder containing the .txt knowledge documents.
        min_chunk_len: Paragraphs shorter than this many characters are
            dropped; very short fragments rarely help retrieval.

    Returns:
        A list of stripped paragraph strings, possibly empty.
    """
    texts = []
    # Sort for a stable, reproducible corpus ordering.
    file_paths = sorted(glob.glob(os.path.join(data_dir, "*.txt")))

    print(f"found {len(file_paths)} files in {data_dir}")
    for path in file_paths:
        try:
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()
        except UnicodeDecodeError:
            # Some exported files use legacy encodings; latin-1 accepts any
            # byte sequence, so it is a safe last-resort fallback.
            with open(path, "r", encoding="latin-1") as f:
                content = f.read()

        # Paragraph-level chunking: the nutrition sources are written in
        # short sections separated by blank lines.
        for chunk in content.split("\n\n"):
            chunk = chunk.strip()
            if len(chunk) < min_chunk_len:
                continue
            texts.append(chunk)

    print(f"total chunks: {len(texts)}")
    return texts


# Build the retrieval index at startup: load every chunk and precompute its
# embedding so per-question retrieval is a single matrix product.
# If the data folder is empty, normalizing a zero-row embedding matrix would
# crash, so we guard and fall back to a "no-knowledge" mode where retrieval
# simply returns nothing.
corpus_chunks = load_corpus_and_chunks(DATA_DIR)

if len(corpus_chunks) > 0:
    corpus_embeddings = embed_model.encode(
        corpus_chunks, convert_to_numpy=True, show_progress_bar=True
    )
    # L2-normalize each row so a dot product with a normalized query vector
    # equals cosine similarity.
    corpus_embeddings = corpus_embeddings / np.linalg.norm(
        corpus_embeddings, axis=1, keepdims=True
    )
else:
    corpus_embeddings = None
    print("warning: no documents found. nutribud will run in no-knowledge mode.")


def retrieve_relevant_chunks(question: str, k: int = TOP_K):
    """
    Retrieval step of the RAG pipeline.

    Embeds the question, L2-normalizes it, and ranks every corpus chunk by
    cosine similarity (dot product of normalized vectors). Returns the text
    of the top *k* chunks, or an empty list when no corpus is loaded.
    """
    # No-knowledge mode: nothing indexed, so there is nothing to rank.
    if corpus_embeddings is None or not corpus_chunks:
        return []

    query_vec = embed_model.encode([question], convert_to_numpy=True)[0]
    query_vec = query_vec / np.linalg.norm(query_vec)

    # Cosine similarity against every chunk at once, then pick the best k.
    similarity = corpus_embeddings @ query_vec
    best = np.argsort(similarity)[::-1][:k]
    return [corpus_chunks[idx] for idx in best]


# we added this safety filter so nutribud does not act like a doctor or give risky advice
# we focused on obvious high-risk keywords that showed up in our brainstorming (rapid weight loss, diabetes, etc.)
def is_high_risk_question(question: str) -> bool:
    """
    Return True when the question touches a topic NutriBud must not answer.

    This is a deliberately simple keyword filter covering the high-risk
    themes identified during design (medical conditions, rapid weight loss,
    restrictive diets, eating disorders, symptoms). Matching is
    case-insensitive substring search.
    """
    lowered = question.lower()

    risky_keywords = (
        "exact calories",
        "calorie meal plan",
        "meal plan",
        "lose 20 pounds",
        "lose 10 pounds",
        "rapid weight loss",
        "crash diet",
        "diabetes",
        "diabetic",
        "blood sugar",
        "keto",
        "intermittent fasting",
        "dizzy",
        "faint",
        "fainting",
        "lightheaded",
        "eating disorder",
        "anorexia",
        "bulimia",
    )

    for keyword in risky_keywords:
        if keyword in lowered:
            return True
    return False


# this is the message we show if the safety check triggers
# we wrote it to be kind but firm about nutribud’s limits and to redirect users to real professionals
def safety_response(question: str) -> str:
    """
    Fixed refusal message shown when the safety filter triggers.

    The wording is kind but firm about NutriBud's limits, redirects the user
    to real professionals, and suggests safe general topics instead. The
    *question* argument is accepted for interface symmetry but not used.
    """
    limits_paragraph = (
        "i’m NutriBud, a general nutrition helper based on public health guidelines. "
        "i can’t give medical advice, personalized meal plans, or recommendations for specific "
        "conditions like diabetes, dizziness with fasting, or rapid weight loss. "
        "it’s really important to talk to a doctor or a registered dietitian for guidance "
        "that is safe for your health."
    )
    redirect_paragraph = (
        "if you’d like, you can ask me more general questions about healthy eating patterns, "
        "like ways to eat more vegetables, choose healthier drinks, or limit highly processed foods."
    )
    return limits_paragraph + "\n\n" + redirect_paragraph


# this function builds the final rag-based answer
# we stitch together an intro plus the most relevant chunks, and we trim if the answer gets too long
def build_rag_answer(question: str) -> str:
    contexts = retrieve_relevant_chunks(question, k=TOP_K)

    if not contexts:
        return (
            "right now, NutriBud doesn’t have any nutrition documents loaded.\n\n"
            "on the backend, we look up answers inside .txt files stored in the data folder of this space. "
            "to enable full answers, please add trusted nutrition documents there and restart the app."
        )

    intro = (
        "here’s a general answer based on the nutrition sources we loaded "
        "(for example canada’s food guide and similar public health material):\n\n"
    )

    body = "\n\n".join(contexts)
    full_text = intro + body

    max_len = 1200
    if len(full_text) > max_len:
        truncated = full_text[:max_len]
        if "." in truncated:
            truncated = truncated.rsplit(".", 1)[0] + "."
        full_text = truncated

    return full_text


# this is the core chat function. gradio will give us the message and the current chat history
# in our design, we ignore the history for retrieval because we focus on single-turn questions
def nutri_chat(message: str, history: list) -> str:
    """
    Core chat callback wired into gradio's ChatInterface.

    The chat *history* is intentionally unused: retrieval is single-turn, so
    each question stands on its own. The safety filter runs before retrieval
    so high-risk questions never reach the RAG answer path.
    """
    # Reject empty or whitespace-only messages up front.
    if not (message and message.strip()):
        return "please type a question about healthy eating to chat with nutribud."

    # High-risk topics get the fixed safety redirect instead of a RAG answer.
    if is_high_risk_question(message):
        return safety_response(message)

    return build_rag_answer(message)


# Wire the chat function into gradio's ChatInterface so the UI behaves like a
# standard chat app. Custom theming was dropped earlier because it caused
# gradio version incompatibilities, so the default layout is used.
demo = gr.ChatInterface(
    fn=nutri_chat,
    title="🌿 NutriBud: Friendly Nutrition RAG Chatbot 🌿",
    description=(
        "Ask NutriBud questions about healthy eating and it will answer using trusted public health documents.\n"
        "Nutribud does not give medical advice or personalized meal plans."
    ),
)

# Standard script entry point: start the gradio server only when run directly.
if __name__ == "__main__":
    demo.launch()