HindHammad committed
Commit 2c13490 · 1 Parent(s): 5c2a339

added app.py, requirements, and cleaned code

Files changed (3)
  1. .gitignore +3 -0
  2. app.py +187 -0
  3. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ venv
+ data
+ README.md
app.py ADDED
@@ -0,0 +1,187 @@
+ import os
+ import glob
+ import numpy as np
+
+ import gradio as gr
+ from sentence_transformers import SentenceTransformer
+
+ # At the top, we set up some basic configuration for our RAG system.
+ # We decided to keep all our trusted nutrition documents as plain .txt files inside a folder called "data".
+ # That way, if we want to update NutriBud later, we can just drop more files into that folder without touching the code.
+ DATA_DIR = "data"
+ TOP_K = 3  # how many chunks we retrieve for each question
+
+ # Here we load the sentence embedding model that we use for retrieval.
+ # We chose "all-MiniLM-L6-v2" because it is light enough to run on CPU but still gives good-quality embeddings.
+ # This model lets us convert both our document chunks and the user’s question into vectors in the same space.
+ print("Loading embedding model...")
+ embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+
+
+ # In this function, we load all the text files from the data folder and turn them into smaller chunks.
+ # We chose to split on double newlines so that we roughly stay at the paragraph level instead of entire pages.
+ # We also skip very short chunks so the retrieval stays focused on meaningful pieces of text.
+ def load_corpus_and_chunks(data_dir: str):
+     texts = []
+     file_paths = glob.glob(os.path.join(data_dir, "*.txt"))
+
+     print(f"Found {len(file_paths)} files in {data_dir}")
+     for path in file_paths:
+         try:
+             with open(path, "r", encoding="utf-8") as f:
+                 content = f.read()
+         except UnicodeDecodeError:
+             # If UTF-8 fails, we fall back to latin-1, because some PDF exports use odd encodings.
+             with open(path, "r", encoding="latin-1") as f:
+                 content = f.read()
+
+         # Here we actually split the file into chunks.
+         # We keep it simple and split on blank lines, which works nicely for guidelines written in short sections.
+         for chunk in content.split("\n\n"):
+             chunk = chunk.strip()
+             # We decided to ignore very short chunks because they usually do not carry enough context.
+             if len(chunk) < 100:
+                 continue
+             texts.append(chunk)
+
+     print(f"Total chunks: {len(texts)}")
+     return texts
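+ # (Illustrative, assuming a hypothetical file data/food_guide.txt:) two guideline
+ # paragraphs separated by a blank line become two separate chunks, as long as each
+ # one is at least 100 characters; a short section heading on its own line is skipped.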
+
+
+ # When the app starts, we load all the chunks and precompute their embeddings.
+ # We do this once at startup so the user does not have to wait for every document to be re-embedded on each question.
+ corpus_chunks = load_corpus_and_chunks(DATA_DIR)
+ corpus_embeddings = embed_model.encode(corpus_chunks, convert_to_numpy=True, show_progress_bar=True)
+
+ # After we get the embeddings, we normalize them.
+ # We decided to normalize so that cosine similarity becomes a simple dot product, which simplifies the retrieval.
+ corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
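+ # With unit-length vectors, cosine similarity reduces to a dot product:
+ # cos(a, b) = (a · b) / (‖a‖‖b‖) = a · b when ‖a‖ = ‖b‖ = 1.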
+
+
+ # This helper function is the core of the retrieval step in our RAG pipeline.
+ # We take the user’s question, embed it, normalize it, and then compare it to every chunk using a dot product.
+ # Then we sort all the scores and pick the top k chunks as our context.
+ def retrieve_relevant_chunks(question: str, k: int = TOP_K):
+     q_emb = embed_model.encode([question], convert_to_numpy=True)[0]
+     q_emb = q_emb / np.linalg.norm(q_emb)
+     scores = np.dot(corpus_embeddings, q_emb)
+     top_indices = np.argsort(scores)[::-1][:k]
+     results = [corpus_chunks[i] for i in top_indices]
+     return results
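+ # (Hypothetical usage, e.g. from a quick local test script:)
+ # for chunk in retrieve_relevant_chunks("How can I cut back on sugary drinks?", k=2):
+ #     print(chunk[:80], "...")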
+
+
+ # Before we generate an answer, we added a small safety layer around the questions.
+ # Our idea here is that NutriBud should not try to act like a doctor or dietitian,
+ # especially for conditions like diabetes or questions about rapid weight loss.
+ # So we wrote a simple keyword-based filter that flags questions as high-risk.
+ def is_high_risk_question(question: str) -> bool:
+     q = question.lower()
+
+     # We decided to keep this list of risky keywords simple and readable,
+     # since in our assignment the main goal is to show our thinking about safety, not to build a perfect classifier.
+     risky_keywords = [
+         "exact calories",
+         "calorie meal plan",
+         "meal plan",
+         "lose 20 pounds",
+         "lose 10 pounds",
+         "rapid weight loss",
+         "crash diet",
+         "diabetes",
+         "diabetic",
+         "blood sugar",
+         "keto",
+         "intermittent fasting",
+         "dizzy",
+         "faint",
+         "fainting",
+         "lightheaded",
+         "eating disorder",
+         "anorexia",
+         "bulimia",
+     ]
+
+     return any(word in q for word in risky_keywords)
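+ # For example, is_high_risk_question("What should a diabetic eat?") is True
+ # because "diabetic" is in the list, while "How do I eat more vegetables?" is False.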
+
+
+ # This is the message we return whenever our safety check decides that the question is too high-risk.
+ # We wrote it in a way that clearly says what NutriBud can and cannot do, and encourages the user
+ # to talk to a health professional instead of relying on an AI for personal medical issues.
+ def safety_response(question: str) -> str:
+     return (
+         "I’m NutriBud, a general nutrition helper based on public health guidelines. "
+         "I can’t give medical advice, personalized meal plans, or recommendations for specific "
+         "conditions like diabetes, dizziness with fasting, or rapid weight loss. "
+         "It’s really important to talk to a doctor or a registered dietitian for guidance "
+         "that is safe for your health. "
+         "If you’d like, you can ask me more general questions about healthy eating patterns, "
+         "like ways to eat more vegetables, choose healthier drinks, or limit highly processed foods."
+     )
+
+
+ # This function is responsible for building the final answer to non-risky questions using our RAG setup.
+ # Instead of calling a large generative model, we decided to keep it more transparent and deterministic:
+ # we retrieve the most relevant chunks and then stitch them into a friendly, short answer.
+ def build_rag_answer(question: str) -> str:
+     # First we get the top K chunks from our corpus based on similarity.
+     contexts = retrieve_relevant_chunks(question, k=TOP_K)
+
+     # We start the answer with a short intro so the user knows the answer comes from general guidelines.
+     intro = (
+         "Here’s a general answer based on the trusted nutrition sources we loaded "
+         "(like Canada’s Food Guide and similar public health guidance):\n\n"
+     )
+
+     # Then we join the retrieved chunks with spacing so they are readable.
+     body = "\n\n".join(contexts)
+
+     # We also decided to limit the total length so that NutriBud’s responses stay compact inside the chat window.
+     full_text = intro + body
+     max_len = 1200
+     if len(full_text) > max_len:
+         truncated = full_text[:max_len]
+         if "." in truncated:
+             truncated = truncated.rsplit(".", 1)[0] + "."
+         full_text = truncated
+
+     return full_text
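+ # (Illustrative:) if intro + body came to 1,500 characters, we would keep the first
+ # 1,200 and then trim back to the last complete sentence ending in a period.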
+
+
+ # This is the main function that Gradio calls every time the user sends a new message.
+ # The "history" parameter is part of the ChatInterface API, but in our design we decided not to use it
+ # inside the retrieval step because we are focusing on single-turn questions for this assignment.
+ def nutri_chat(message: str, history: list):
+     # First we check if the question looks high-risk according to our keyword filter.
+     if is_high_risk_question(message):
+         return safety_response(message)
+
+     # If not high-risk, we go through the normal RAG pipeline and return a context-based answer.
+     return build_rag_answer(message)
+
+
+ # For the user interface, we first tried more customized layouts with Blocks and custom CSS.
+ # While exploring that, we read the Gradio documentation on custom CSS and JS here:
+ # https://www.gradio.app/guides/custom-CSS-and-JS
+ # However, because Gradio 6 changed some of the arguments (like removing css and theme in some places),
+ # those experiments started causing errors during our local testing.
+ # To make sure NutriBud is stable and easy to deploy on Hugging Face Spaces,
+ # we decided to switch to ChatInterface, which is much simpler but still looks clean.
+ demo = gr.ChatInterface(
+     fn=nutri_chat,
+     title="NutriBud — Healthy Nutrition RAG Chatbot",
+     description=(
+         "Ask general questions about healthy eating, like:\n"
+         "• How can I eat more vegetables every day?\n"
+         "• What are healthier drink choices instead of sugary drinks?\n"
+         "NutriBud uses a retrieval-augmented approach on trusted public health documents.\n"
+         "It does not give medical advice or personalized meal plans."
+     ),
+ )
+
+
+ # At the end, we launch the app in the usual Gradio way.
+ # When testing locally, we run `python app.py` and open the local URL
+ # (for example http://127.0.0.1:7860) in the browser.
+ # On Hugging Face Spaces, we keep this structure because the Space calls this file as the main entry point.
+ if __name__ == "__main__":
+     demo.launch()
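+ # (Hypothetical variant, if we ever need to serve the app on our local network:)
+ # demo.launch(server_name="0.0.0.0", server_port=7860)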
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ torch
+ transformers
+ sentence-transformers
+ numpy