NutriBud — a nutrition-focused RAG chatbot (Hugging Face Space app source follows).
import os
import glob
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer

# Basic configuration: every nutrition text file lives in one folder, so the
# corpus can be refreshed later just by dropping in new .txt files — no code
# changes required.
DATA_DIR = "data"
TOP_K = 3

# Load the embedding model exactly once at startup. This MiniLM model was
# chosen because it runs well on CPU while still giving good semantic retrieval.
print("loading embedding model...")
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def load_corpus_and_chunks(data_dir: str, min_chunk_len: int = 100) -> list[str]:
    """Load every .txt file under *data_dir* and split it into paragraph chunks.

    Paragraph-level chunks (split on blank lines) give retrieval enough
    context per match without making answers overly long.

    Args:
        data_dir: folder containing the nutrition .txt source files.
        min_chunk_len: chunks shorter than this many characters are dropped,
            since tiny fragments rarely help retrieval. Defaults to 100,
            matching the original hard-coded threshold.

    Returns:
        A list of stripped text chunks drawn from all files (empty if the
        folder has no .txt files or no chunk passes the length filter).
    """
    texts = []
    file_paths = glob.glob(os.path.join(data_dir, "*.txt"))
    print(f"found {len(file_paths)} files in {data_dir}")
    for path in file_paths:
        try:
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()
        except UnicodeDecodeError:
            # Some exported files use legacy encodings; latin-1 can decode any
            # byte sequence, so it is a safe last-resort fallback.
            with open(path, "r", encoding="latin-1") as f:
                content = f.read()
        # Double newlines mark section boundaries in our nutrition sources.
        for chunk in content.split("\n\n"):
            chunk = chunk.strip()
            if len(chunk) < min_chunk_len:
                continue
            texts.append(chunk)
    print(f"total chunks: {len(texts)}")
    return texts
# Load the corpus chunks and precompute their embeddings once at startup.
# With an empty data folder the norm division would divide by zero and crash
# the app, so a guarded "no-knowledge" mode is used instead.
corpus_chunks = load_corpus_and_chunks(DATA_DIR)
if not corpus_chunks:
    corpus_embeddings = None
    print("warning: no documents found. nutribud will run in no-knowledge mode.")
else:
    corpus_embeddings = embed_model.encode(
        corpus_chunks, convert_to_numpy=True, show_progress_bar=True
    )
    # Pre-normalize rows so retrieval can use a plain dot product as cosine similarity.
    norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
    corpus_embeddings = corpus_embeddings / norms
def retrieve_relevant_chunks(question: str, k: int = TOP_K):
    """Return the *k* corpus chunks most similar to *question*.

    This is the retrieval half of the RAG pipeline: the question is embedded,
    normalized, and compared against the pre-normalized corpus embeddings via
    a dot product (cosine similarity), highest scores first. An empty list is
    returned when no corpus is loaded, so callers can explain that state.
    """
    if corpus_embeddings is None or not corpus_chunks:
        return []
    query_vec = embed_model.encode([question], convert_to_numpy=True)[0]
    query_vec = query_vec / np.linalg.norm(query_vec)
    similarities = corpus_embeddings @ query_vec
    ranked = np.argsort(similarities)[::-1]
    return [corpus_chunks[idx] for idx in ranked[:k]]
# Safety filter so NutriBud does not act like a doctor or give risky advice.
# The keywords cover the obvious high-risk topics from our brainstorming
# (rapid weight loss, diabetes, fainting, eating disorders, fad diets, ...).
# Hoisted to module level so the constant list is not rebuilt on every call.
_RISKY_KEYWORDS = (
    "exact calories",
    "calorie meal plan",
    "meal plan",
    "lose 20 pounds",
    "lose 10 pounds",
    "rapid weight loss",
    "crash diet",
    "diabetes",
    "diabetic",
    "blood sugar",
    "keto",
    "intermittent fasting",
    "dizzy",
    "faint",
    "fainting",
    "lightheaded",
    "eating disorder",
    "anorexia",
    "bulimia",
)


def is_high_risk_question(question: str) -> bool:
    """Return True when *question* touches a topic NutriBud must not answer.

    Matching is a simple case-insensitive substring check, so e.g. "diabetic"
    also flags "pre-diabetic". False positives are acceptable here: the safe
    refusal message is preferred over risky advice.
    """
    q = question.lower()
    return any(keyword in q for keyword in _RISKY_KEYWORDS)
# Fixed message shown whenever the safety check triggers: kind but firm about
# NutriBud's limits, and a redirect toward real professionals and safer topics.
def safety_response(question: str) -> str:
    """Return the canned refusal reply for high-risk questions.

    *question* is accepted for interface symmetry with the other handlers but
    does not influence the reply text.
    """
    limits = (
        "i’m NutriBud, a general nutrition helper based on public health guidelines. "
        "i can’t give medical advice, personalized meal plans, or recommendations for specific "
        "conditions like diabetes, dizziness with fasting, or rapid weight loss. "
        "it’s really important to talk to a doctor or a registered dietitian for guidance "
        "that is safe for your health."
    )
    redirect = (
        "if you’d like, you can ask me more general questions about healthy eating patterns, "
        "like ways to eat more vegetables, choose healthier drinks, or limit highly processed foods."
    )
    return limits + "\n\n" + redirect
# Builds the final RAG-based answer: an intro stitched to the most relevant
# chunks, trimmed back to the last full sentence if it runs too long.
def build_rag_answer(question: str) -> str:
    """Compose the retrieval-based reply for a non-risky question.

    When no documents are loaded, returns instructions for enabling the
    knowledge base instead of an empty answer.
    """
    contexts = retrieve_relevant_chunks(question, k=TOP_K)
    if not contexts:
        return (
            "right now, NutriBud doesn’t have any nutrition documents loaded.\n\n"
            "on the backend, we look up answers inside .txt files stored in the data folder of this space. "
            "to enable full answers, please add trusted nutrition documents there and restart the app."
        )
    answer = (
        "here’s a general answer based on the nutrition sources we loaded "
        "(for example canada’s food guide and similar public health material):\n\n"
        + "\n\n".join(contexts)
    )
    # Keep replies chat-sized: cut at 1200 characters, then back up to the end
    # of the last complete sentence so the answer doesn't stop mid-thought.
    limit = 1200
    if len(answer) > limit:
        clipped = answer[:limit]
        if "." in clipped:
            clipped = clipped.rsplit(".", 1)[0] + "."
        answer = clipped
    return answer
# Core chat handler: gradio supplies the message and the running chat history.
# Retrieval is single-turn by design, so *history* is intentionally unused.
def nutri_chat(message: str, history: list) -> str:
    """Route one user message to the prompt nudge, the safety refusal,
    or the RAG answer."""
    if message is None or not message.strip():
        return "please type a question about healthy eating to chat with nutribud."
    if is_high_risk_question(message):
        return safety_response(message)
    return build_rag_answer(message)
# gr.ChatInterface gives a standard chat layout out of the box. Earlier
# attempts at direct theme styling hit version errors, so the configuration
# is deliberately minimal.
demo = gr.ChatInterface(
    fn=nutri_chat,
    title="🌿 NutriBud: Friendly Nutrition RAG Chatbot 🌿",
    description=(
        "Ask NutriBud questions about healthy eating and it will answer using trusted public health documents.\n"
        "Nutribud does not give medical advice or personalized meal plans."
    ),
)

if __name__ == "__main__":
    demo.launch()