pikam00 committed on
Commit
fd6d7e3
·
verified ·
1 Parent(s): 60817f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -215
app.py CHANGED
@@ -1,228 +1,117 @@
1
  import gradio as gr
 
2
  from huggingface_hub import InferenceClient
 
3
  from sentence_transformers import SentenceTransformer
4
  import torch
5
- import random
6
- import re
7
 
8
- # ===== Models =====
9
  client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
10
- embedder = SentenceTransformer("all-MiniLM-L6-v2")
11
-
12
- # ===== Load & sanitize corpus =====
13
- with open("journal.txt", "r", encoding="utf-8") as f:
14
- raw_text = f.read()
15
-
16
ROLE_TAGS = re.compile(
    r'\[/?(?:USER|ASST)\]|\</?(?:user|assistant)\>|<\|(?:user|assistant)\|>',
    re.IGNORECASE,
)

# Lower-case line prefixes that mark chat-transcript lines to be dropped.
_ROLE_LINE_PREFIXES = (
    "user wrote:",
    "/user wrote:",
    "assistant wrote:",
    "/assistant wrote:",
    "user:",
    "assistant:",
)


def clean_corpus(text: str) -> str:
    """Remove chat role markup from *text*.

    Inline role tags matched by ``ROLE_TAGS`` are deleted, then every line
    whose stripped, lower-cased form starts with one of the transcript
    prefixes is dropped. ``None`` or empty input yields an empty string.

    Returns the surviving lines joined with newlines.
    """
    without_tags = ROLE_TAGS.sub("", text or "")
    kept = [
        line
        for line in without_tags.splitlines()
        # str.startswith accepts a tuple, so one call covers every prefix.
        if not line.strip().lower().startswith(_ROLE_LINE_PREFIXES)
    ]
    return "\n".join(kept)
34
-
35
- journal_text = clean_corpus(raw_text)
36
-
37
- # ===== Chunk + embed (safe if file is short/empty) =====
38
def preprocess_text(text: str):
    """Split *text* into de-duplicated retrieval chunks.

    Three kinds of chunk are produced from the stripped text:
    single sentences (split on ".") longer than 10 characters, groups of
    three consecutive sentences longer than 20 characters, and paragraphs
    (split on blank lines) longer than 30 characters. Chunks of 15
    characters or fewer are discarded and duplicates are removed, keeping
    first-seen order.

    Returns a list of strings; empty or ``None`` input returns ``[]``.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return []

    sents = [s.strip() for s in cleaned.split(".") if s.strip()]
    sentence_chunks = [s for s in sents if len(s) > 10]

    combined = []
    for start in range(0, len(sents), 3):
        chunk = ". ".join(sents[start:start + 3])
        if len(chunk) > 20:
            combined.append(chunk)

    # Measure the stripped paragraph so surrounding whitespace cannot push a
    # short paragraph over the length threshold.
    paras = [p for p in (raw.strip() for raw in cleaned.split("\n\n")) if len(p) > 30]

    # dict.fromkeys removes duplicates while preserving insertion order.
    candidates = sentence_chunks + combined + paras
    return [c for c in dict.fromkeys(candidates) if len(c) > 15]
60
-
61
- chunks = preprocess_text(journal_text)
62
- HAS_CORPUS = len(chunks) > 0
63
- embeddings = embedder.encode(chunks, convert_to_tensor=True) if HAS_CORPUS else None
64
-
65
def get_top_chunks(query: str, top_k: int = 5):
    """Return the corpus chunks most similar to *query*.

    The query is embedded with the module-level ``embedder``; both the query
    vector and the rows of ``embeddings`` are L2-normalised, so the matrix
    product gives cosine similarities. Up to ``top_k`` best-scoring chunks
    (at least one is requested) are considered and only those scoring above
    0.25 are returned.

    Returns a list of chunk strings, empty when there is no corpus or query.
    """
    # One guard covers every "nothing to search" case, including an empty
    # chunk list.
    if not (HAS_CORPUS and embeddings is not None and query and chunks):
        return []

    q = embedder.encode(query, convert_to_tensor=True)
    q = q / q.norm()
    M = embeddings / embeddings.norm(dim=1, keepdim=True)

    k = max(1, min(top_k, len(chunks)))
    scores, idxs = torch.topk(torch.matmul(M, q), k=k)

    return [
        chunks[int(idx)]
        for score, idx in zip(scores, idxs)
        if score.item() > 0.25
    ]
82
-
83
def join_context(chunks_list, max_chars=900):
    """Join chunks with blank lines without exceeding *max_chars*.

    Chunks are stripped and appended in order, separated by ``"\\n\\n"``.
    Joining stops at the first chunk that would push the result past
    ``max_chars``. Empty chunks are skipped.

    Returns the joined string (possibly empty).
    """
    parts = []
    used = 0
    for raw in chunks_list:
        piece = raw.strip()
        if not piece:
            continue
        # The separator only costs characters once something precedes it;
        # charging it to the first chunk would reject a chunk that fits.
        sep_len = 2 if parts else 0
        if used + sep_len + len(piece) > max_chars:
            break
        parts.append(piece)
        used += sep_len + len(piece)
    return "\n\n".join(parts)
91
-
92
- # ===== Tiny safety =====
93
CRISIS_TERMS = ["suicide","kill myself","end my life","self-harm","hurt myself","overdose","harm others","kill someone"]


def is_crisis(msg: str) -> bool:
    """Return True when *msg* contains any phrase from ``CRISIS_TERMS``.

    Matching is case-insensitive substring matching. Runs of whitespace
    (including newlines) are collapsed to single spaces first so that
    multi-word terms still match when the words are separated by more than
    one space. ``None`` is treated as an empty message.
    """
    m = " ".join((msg or "").lower().split())
    return any(t in m for t in CRISIS_TERMS)
97
-
98
- # ===== Emotion gate & extraction =====
99
EMOTION_HINTS = [
    "i feel", "i'm feeling", "i am feeling", "feel", "feeling",
    "overwhelmed", "stressed", "anxious", "sad", "lonely",
    "angry", "upset", "worried", "guilty", "ashamed",
    "proud", "happy", "excited", "tired", "burned out", "burnt out"
]

# A hint must begin at a word boundary; this keeps "feels"/"feelings" matching
# while rejecting mid-word hits such as "sad" in "ambassador" or "tired" in
# "retired".
_EMOTION_HINT_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(hint) for hint in EMOTION_HINTS) + r")",
    re.IGNORECASE,
)


def mentions_emotion(msg: str) -> bool:
    """Return True when *msg* contains an emotion hint at the start of a word.

    Matching is case-insensitive. ``None`` is treated as an empty message.
    """
    return bool(_EMOTION_HINT_RE.search(msg or ""))
109
-
110
- # normalize common typos like "jm sad" -> "i'm sad", "im sad" -> "i'm sad"
111
# Compiled once at import time instead of on every call.
_LEADING_JM_RE = re.compile(r"^\s*jm\b", re.IGNORECASE)
_BARE_IM_RE = re.compile(r"\bim\b", re.IGNORECASE)


def normalize(msg: str) -> str:
    """Strip *msg* and repair two common typos of "I'm".

    A leading "jm" and any standalone "im" (case-insensitive) are rewritten
    to "I'm". ``None`` is treated as an empty string instead of raising.
    """
    m = (msg or "").strip()
    m = _LEADING_JM_RE.sub("I'm", m)
    return _BARE_IM_RE.sub("I'm", m)
116
-
117
- # very simple extraction: try to grab phrase after "I feel/I'm feeling/feeling ..."
118
EMO_RE = re.compile(
    r"\b(i\s*feel|i\s*am\s*feeling|i'm\s*feeling|im\s*feeling|feeling)\s+([^.,;!?]{1,40})",
    re.IGNORECASE
)
# fallback list if no phrase captured
EMO_WORDS = [
    "overwhelmed","stressed","anxious","sad","lonely","angry","upset",
    "worried","guilty","ashamed","proud","happy","excited","tired",
    "burned out","burnt out"
]


def extract_emotion(msg: str) -> str:
    """Return a short phrase describing the emotion expressed in *msg*.

    The message is first passed through ``normalize``. If ``EMO_RE`` matches,
    the text captured after the "I feel / feeling" lead-in is returned with
    internal whitespace collapsed. Otherwise the first entry of ``EMO_WORDS``
    found in the lower-cased message is returned, and "this way" is the last
    resort.
    """
    m = normalize(msg).strip()

    hit = EMO_RE.search(m)
    if hit:
        phrase = hit.group(2)
        next_char = m[hit.end(2):hit.end(2) + 1]
        # The capture is capped at 40 characters. When the cap lands in the
        # middle of a word, drop that fragment rather than echo a cut-off word.
        if next_char.isalnum() and phrase[-1].isalnum() and len(phrase.split()) > 1:
            phrase = phrase.rsplit(None, 1)[0]
        return re.sub(r"\s+", " ", phrase.strip())

    m_low = m.lower()
    for w in EMO_WORDS:
        if w in m_low:
            return w
    return "this way"  # last resort
145
-
146
- # ===== Tiny break ideas (only when feelings are mentioned) =====
147
BREAKS = [
    "Try box breathing 4-4-4-4 for 60 seconds.",
    "Unclench your jaw and roll your shoulders slowly three times.",
    "Look away from the screen and name 5 things you can see.",
    "Sip water slowly and take three deep breaths.",
    "Stand up, stretch overhead, and feel your feet on the ground."
]


def pick_break():
    """Return one randomly chosen suggestion from ``BREAKS``."""
    return random.choice(BREAKS)
156
 
157
# ===== Chat handler =====
def _history_as_messages(history):
    """Convert Gradio chat history into chat-completion message dicts.

    Accepts both history layouts: a list of ``(user, assistant)`` pairs and a
    list of ``{"role": ..., "content": ...}`` dicts. Only the role and content
    are forwarded; empty turns are skipped.
    """
    converted = []
    for turn in history or []:
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str) and content:
                converted.append({"role": role, "content": content})
        else:
            u, a = turn
            if u:
                converted.append({"role": "user", "content": u})
            if a:
                converted.append({"role": "assistant", "content": a})
    return converted


def respond(message, history):
    """Produce Otium's reply to *message*.

    Flow: an empty message or a message without emotion words gets a fixed
    greeting; a message flagged by ``is_crisis`` gets a fixed help-line
    message; otherwise the extracted emotion and any retrieved journal
    snippets are placed in the system prompt and the chat model is called.
    The reply always contains "Why do you feel <emotion>?" and ends with a
    break suggestion from ``pick_break``.

    Parameters:
        message: the user's latest text (may be ``None``).
        history: prior turns as supplied by ``gr.ChatInterface``.

    Returns the reply text.
    """
    import logging

    msg = (message or "").strip()
    if not msg:
        return "Hey, I’m Otium. I’m here to listen whenever you want to talk about your day or how you’re feeling."

    # Safety
    if is_crisis(msg):
        return (
            "I’m glad you reached out. I’m not a crisis service, but help is available:\n"
            "• U.S.: call or text 988 (988lifeline.org)\n"
            "• Elsewhere: contact local emergency services."
        )

    # If no emotions yet → friendly hello only
    if not mentions_emotion(msg):
        return ("Hey, I’m Otium. I’m here to listen whenever you want to talk about your day "
                "or how you’re feeling. No pressure—share only when you’re ready.")

    # Emotions present → retrieve (if any) + short support
    emo = extract_emotion(msg)
    context_block = join_context(get_top_chunks(msg, top_k=5)) if HAS_CORPUS else ""

    system_msg = (
        "You are Otium, a warm journaling buddy. Not medical advice. "
        "Output plain text only (no role labels or chat logs). "
        "Reflect the user’s feelings in simple, kind language. "
        "Ask exactly ONE question phrased as: 'Why do you feel {emotion}?', "
        "where {emotion} is the extracted emotion provided below. "
        "Keep the reply short (3–5 sentences) and end with one tiny break idea. "
        "Avoid clinical terms or medical guidance.\n\n"
        f"Extracted emotion: {emo}\n"
    )
    if context_block:
        system_msg += f"\nHelpful snippets from the user's content:\n{context_block}"

    # Build messages for the model
    messages = [{"role": "system", "content": system_msg}]
    messages.extend(_history_as_messages(history))
    messages.append({"role": "user", "content": normalize(msg)})

    # Call model, with stop strings to avoid chat-log artifacts
    try:
        resp = client.chat_completion(
            messages=messages,
            max_tokens=220,
            temperature=0.7,
            stop=["User wrote:", "Assistant wrote:", "User:", "Assistant:"]
        )
        text = resp["choices"][0]["message"]["content"].strip()
    except Exception:
        # Best-effort boundary: keep the friendly fallback, but record the
        # failure instead of discarding it silently.
        logging.getLogger(__name__).exception("chat_completion failed")
        text = f"Thanks for sharing that. Why do you feel {emo}?"

    # Guarantee the explicit question appears (belt-and-suspenders)
    if f"Why do you feel {emo}?" not in text:
        text = text.rstrip(".! ") + f"\n\nWhy do you feel {emo}?"

    return f"{text}\n\n**Tiny break idea:** {pick_break()}"
219
-
220
- # ===== Minimal UI =====
221
- chatbot = gr.ChatInterface(
222
- respond,
223
- title="Otium — A Friendly Check-In",
224
- description="Say hello whenever you’re ready. Otium only offers support once you talk about feelings. (Not medical advice.)"
225
- )
226
-
227
- if __name__ == "__main__":
228
- chatbot.launch()
 
1
  import gradio as gr
2
+ import random
3
  from huggingface_hub import InferenceClient
4
+ # import lines go at the top: any libraries I need to import go up here ^^
5
  from sentence_transformers import SentenceTransformer
6
  import torch
 
 
7
 
 
8
  client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
9
+
10
+ # Step 1
11
+ with open("Untitled document.txt", "r", encoding="utf-8") as f:
12
+ skincare_text = f.read()
13
+
14
+ # Step 2: Preprocess text into sentence chunks
15
def preprocess_text(text):
    """Split *text* into de-duplicated retrieval chunks.

    Produces single sentences (split on ".") longer than 10 characters,
    windows of up to three sentences that advance two sentences at a time
    (so consecutive windows share one sentence) longer than 20 characters,
    and paragraphs (split on blank lines) longer than 30 characters. Chunks
    of 15 characters or fewer are dropped and duplicates removed in
    first-seen order. A summary is printed before returning.

    ``None`` is treated as empty text. Returns a list of strings.
    """
    cleaned_text = (text or "").strip()
    sentences = [s.strip() for s in cleaned_text.split('.') if s.strip()]
    # Sentences are already stripped above, so no second strip is needed.
    sentence_chunks = [s for s in sentences if len(s) > 10]

    combined_chunks = []
    for i in range(0, len(sentences), 2):
        chunk = '. '.join(sentences[i:i + 3])
        if len(chunk) > 20:
            combined_chunks.append(chunk)

    paragraphs = [p.strip() for p in cleaned_text.split('\n\n') if p.strip()]
    paragraph_chunks = [p for p in paragraphs if len(p) > 30]

    all_chunks = sentence_chunks + combined_chunks + paragraph_chunks

    # dict.fromkeys removes duplicates while preserving insertion order.
    final_chunks = [c for c in dict.fromkeys(all_chunks) if len(c) > 15]

    print(f"Created {len(final_chunks)} chunks using advanced strategy")
    print(f"Sample chunks: {final_chunks[:3]}")
    return final_chunks
41
+
42
+ cleaned_chunks = preprocess_text(skincare_text)
43
+
44
+ # Step 3: Convert chunks into embeddings
45
+ model = SentenceTransformer('all-MiniLM-L6-v2')
46
+
47
def create_embeddings(text_chunks):
    """Encode *text_chunks* with the module-level ``model``.

    Calls ``model.encode(..., convert_to_tensor=True)``, prints the shape of
    the result and returns it.
    """
    # Named differently from the module-level ``chunk_embeddings`` so the
    # local does not shadow it.
    encoded = model.encode(text_chunks, convert_to_tensor=True)
    print(f"Embeddings shape: {encoded.shape}")
    return encoded
51
+
52
+ chunk_embeddings = create_embeddings(cleaned_chunks)
53
+
54
+ # Step 4: Retrieve top matching chunks
55
def get_top_chunks(query, chunk_embeddings, text_chunks, top_k=3):
    """Return the chunks most similar to *query* together with their scores.

    The query is embedded with the module-level ``model``. The query vector
    and the rows of *chunk_embeddings* are L2-normalised, so their matrix
    product gives cosine similarities. The best ``top_k`` candidates are
    taken and only those scoring above 0.3 are kept.

    Parameters:
        query: text to search for.
        chunk_embeddings: 2-D tensor with one row per entry of *text_chunks*.
        text_chunks: list of chunk strings.
        top_k: maximum number of candidates to consider.

    Returns a tuple ``(results, scores)``: the list of matching chunk strings
    and a tensor holding their similarity scores. Both are empty when there
    is nothing to search.
    """
    # Without this guard an empty corpus fails inside norm(dim=1).
    if (
        not query
        or not text_chunks
        or chunk_embeddings is None
        or chunk_embeddings.dim() != 2
    ):
        return [], torch.empty(0)

    query_embedding = model.encode(query, convert_to_tensor=True)
    # clamp_min avoids dividing by zero for an all-zero vector.
    query_norm = query_embedding / query_embedding.norm().clamp_min(1e-12)
    chunks_norm = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True).clamp_min(1e-12)
    similarities = torch.matmul(chunks_norm, query_norm)

    k = min(top_k, len(text_chunks), similarities.shape[0])
    top_scores, top_indices = torch.topk(similarities, k=k)

    results = []
    for score, idx in zip(top_scores, top_indices):
        if score.item() > 0.3:  # Only include reasonably relevant chunks
            # int() turns the 0-d index tensor into a plain list index.
            results.append(text_chunks[int(idx)])

    # topk returns scores in descending order, so the scores above the
    # threshold are exactly the first len(results) entries.
    return results, top_scores[:len(results)]
69
+
70
+ # Step 5: Relevance checker
71
_SKINCARE_KEYWORDS = (
    'skin', 'skincare', 'acne', 'wrinkles', 'moisturizer', 'cleanser',
    'sunscreen', 'serum', 'retinol', 'vitamin', 'dry', 'oily', 'sensitive',
    'aging', 'pores', 'blackheads', 'routine', 'face', 'facial', 'beauty',
    'dermatology', 'cosmetic', 'cream', 'lotion', 'toner', 'exfoliate',
    'hydration', 'anti-aging', 'blemish', 'spot', 'dark circles',
)


def _occurs_at_word_start(text, term):
    """Return True if *term* occurs in *text* right after a non-alphanumeric
    character or at the very beginning."""
    pos = text.find(term)
    while pos != -1:
        if pos == 0 or not text[pos - 1].isalnum():
            return True
        pos = text.find(term, pos + 1)
    return False


def is_skincare_related(query):
    """Return True when *query* mentions a skincare keyword.

    Matching is case-insensitive and a keyword must begin at the start of a
    word. Plurals and longer forms still match ("moisturizers"), but
    mid-word hits such as "skin" inside "asking" or "dry" inside "laundry"
    do not. ``None`` is treated as an empty query.
    """
    query_lower = (query or "").lower()
    return any(_occurs_at_word_start(query_lower, keyword) for keyword in _SKINCARE_KEYWORDS)
82
+
83
# Sample queries used to print retrieval results at start-up.
queries = [
    "Consistent skincare routine",
    "Applying sunscreen daily",
    "Choosing products that match your skin type",
]

for q in queries:
    print(f"\nQuery: {q}")
    # get_top_chunks returns (chunks, scores); unpack it so the loop below
    # lists the matching chunks rather than the two tuple members.
    results, _scores = get_top_chunks(q, chunk_embeddings, cleaned_chunks)
    for idx, res in enumerate(results, 1):
        print(f"Result {idx}: {res}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
 
def _history_to_messages(history):
    """Convert Gradio chat history into chat-completion message dicts.

    Accepts both history layouts: a list of ``(user, assistant)`` pairs and a
    list of ``{"role": ..., "content": ...}`` dicts. Only the role and content
    are forwarded; empty turns are skipped.
    """
    converted = []
    for turn in history or []:
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str) and content:
                converted.append({"role": role, "content": content})
        else:
            user_text, assistant_text = turn
            if user_text:
                converted.append({"role": "user", "content": user_text})
            if assistant_text:
                converted.append({"role": "assistant", "content": assistant_text})
    return converted


def respond(message, history):
    """Answer *message* using retrieved skincare chunks as context.

    The chunks returned by ``get_top_chunks`` are printed, joined with
    newlines and placed in the system prompt. Prior turns from *history* and
    the new user message follow, and the reply from
    ``client.chat_completion`` is returned with surrounding whitespace
    removed.
    """
    # get_top_chunks returns (chunks, scores). Only the chunk text belongs in
    # the prompt; interpolating the whole tuple would put a Python list and
    # tensor repr in front of the model.
    top_chunks, _scores = get_top_chunks(message, chunk_embeddings, cleaned_chunks)
    print(top_chunks)

    context = "\n".join(top_chunks)
    messages = [{"role": "system", "content": f"You are a friendly chatbot. You give people advice about skincare. Base your response on the following information: {context}"}]

    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    response = client.chat_completion(messages, max_tokens=100)
    # Treat missing content as an empty reply instead of failing on .strip().
    return (response['choices'][0]['message']['content'] or "").strip()
107
+
108
def echo(message, history):
    """Return *message* unchanged; *history* is ignored."""
    return message
110
+
111
def yes_or_no(message, history):
    """Return a random canned answer; both arguments are ignored."""
    return random.choice(['Yes', 'No', 'Maybe', 'Ask Again'])
113
+
114
# Define the chat UI: users can send new messages and see their conversation
# history, with replies produced by respond().
chatbot = gr.ChatInterface(respond)

if __name__ == "__main__":
    # Guarded so that importing this module does not start the web server.
    chatbot.launch()