Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,416 +1,215 @@
|
|
| 1 |
import json
|
| 2 |
-
import os
|
| 3 |
-
import datetime
|
| 4 |
-
import csv
|
| 5 |
import random
|
| 6 |
import re
|
| 7 |
import string
|
| 8 |
-
import
|
|
|
|
|
|
|
| 9 |
import gradio as gr
|
| 10 |
-
from difflib import SequenceMatcher
|
| 11 |
|
| 12 |
# -----------------------------
|
| 13 |
-
# Config
|
| 14 |
# -----------------------------
|
| 15 |
DATA_PATH = "quotes.json"
|
| 16 |
-
MIN_SCORE = 1 # minimum score to consider a match (used after scoring)
|
| 17 |
-
TOP_N = 5 # how many top quotes to consider for fusion
|
| 18 |
|
| 19 |
-
# -----------------------------
|
| 20 |
-
# Utility: load & normalize data
|
| 21 |
-
# -----------------------------
|
| 22 |
def load_quotes():
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
if not isinstance(data, dict):
|
| 37 |
-
print(f"{DATA_PATH} root must be an object/dict.")
|
| 38 |
-
return {}
|
| 39 |
-
|
| 40 |
-
normalized = {}
|
| 41 |
-
for cat, val in data.items():
|
| 42 |
-
if isinstance(val, list):
|
| 43 |
-
normalized[cat] = [str(q).strip() for q in val if isinstance(q, str) and q.strip()]
|
| 44 |
-
elif isinstance(val, dict):
|
| 45 |
-
# old schema; flatten positive/negative lists (preserve order)
|
| 46 |
-
merged = []
|
| 47 |
-
for k in ("positive", "negative"):
|
| 48 |
-
items = val.get(k, [])
|
| 49 |
-
if isinstance(items, list):
|
| 50 |
-
merged.extend([str(q).strip() for q in items if isinstance(q, str) and q.strip()])
|
| 51 |
-
normalized[cat] = merged
|
| 52 |
-
else:
|
| 53 |
-
# unsupported format
|
| 54 |
-
normalized[cat] = []
|
| 55 |
-
print(f"Loaded {len(normalized)} categories from {DATA_PATH}.")
|
| 56 |
-
return normalized
|
| 57 |
-
except Exception as e:
|
| 58 |
-
print(f"Error loading {DATA_PATH}: {e}")
|
| 59 |
-
return {}
|
| 60 |
|
| 61 |
-
QUOTES = load_quotes()
|
| 62 |
|
| 63 |
# -----------------------------
|
| 64 |
-
# Text helpers
|
| 65 |
# -----------------------------
|
| 66 |
-
punct_re = re.compile(f"[{re.escape(string.punctuation)}]")
|
| 67 |
-
|
| 68 |
STOPWORDS = {
|
| 69 |
"the","a","an","and","or","but","if","then","so","than","to","of","in","on","at","for",
|
| 70 |
"is","are","was","were","be","being","been","it","that","this","these","those","with",
|
| 71 |
"as","by","from","about","into","over","after","before","up","down","out"
|
| 72 |
}
|
| 73 |
|
| 74 |
-
POS_HINTS = {"good","great","love","like","enjoy","awesome","amazing","nice","positive","best","fantastic","excellent"
|
| 75 |
-
NEG_HINTS = {"bad","hate","dislike","worst","awful","terrible","negative","poor","meh","gross","unsafe","hard","difficult"
|
|
|
|
|
|
|
| 76 |
|
| 77 |
def normalize(text: str) -> str:
|
| 78 |
-
return punct_re.sub(" ", (text or "").lower())
|
| 79 |
|
| 80 |
def tokenize(text: str):
|
| 81 |
return [t for t in normalize(text).split() if t and t not in STOPWORDS]
|
| 82 |
|
| 83 |
-
def
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
score = overlap * 3 + int(seq_sim * 3)
|
| 93 |
-
return score
|
| 94 |
-
|
| 95 |
-
def find_top_matches(category: str, user_text: str, top_n: int = TOP_N):
|
| 96 |
-
"""Return list of (quote, score) sorted desc for given category."""
|
| 97 |
-
quotes = QUOTES.get(category, [])
|
| 98 |
-
scored = []
|
| 99 |
-
for q in quotes:
|
| 100 |
-
sc = score_quote_against_query(q, user_text)
|
| 101 |
-
scored.append((q, sc))
|
| 102 |
-
scored.sort(key=lambda x: x[1], reverse=True)
|
| 103 |
-
# filter low scores
|
| 104 |
-
filtered = [t for t in scored if t[1] >= MIN_SCORE]
|
| 105 |
-
return filtered[:top_n]
|
| 106 |
|
| 107 |
# -----------------------------
|
| 108 |
-
#
|
| 109 |
-
# -----------------------------
|
| 110 |
-
def pick_pivot_token(quotes, user_tokens):
|
| 111 |
-
freq = {}
|
| 112 |
-
for q, _sc in quotes:
|
| 113 |
-
toks = tokenize(q)
|
| 114 |
-
for t in toks:
|
| 115 |
-
if t in STOPWORDS: continue
|
| 116 |
-
freq[t] = freq.get(t, 0) + 1
|
| 117 |
-
for t in user_tokens:
|
| 118 |
-
if t in freq:
|
| 119 |
-
return t
|
| 120 |
-
if not freq: return None
|
| 121 |
-
return max(freq.items(), key=lambda x: x[1])[0]
|
| 122 |
-
|
| 123 |
-
def extract_traits_for_pivot(quote, pivot):
|
| 124 |
-
toks = tokenize(quote)
|
| 125 |
-
trait_tokens = [t for t in toks if t != pivot and t not in STOPWORDS]
|
| 126 |
-
if not trait_tokens: return None
|
| 127 |
-
return " ".join(trait_tokens[:5])
|
| 128 |
-
|
| 129 |
-
def fuse_quotes(top_matches, user_text):
|
| 130 |
-
"""
|
| 131 |
-
Build a compact fused sentence from top_matches (list of (quote, score)).
|
| 132 |
-
Strategy:
|
| 133 |
-
- Determine pivot token from matches and user_text.
|
| 134 |
-
- Extract short traits from each match and combine into: "The <pivot> is X, Y and Z."
|
| 135 |
-
- Fallback: return first match (trimmed) or small snippets.
|
| 136 |
-
"""
|
| 137 |
-
if not top_matches:
|
| 138 |
-
return "No relevant quotes found."
|
| 139 |
-
|
| 140 |
-
# If only one quote, return it directly (trim to reasonable length)
|
| 141 |
-
if len(top_matches) == 1:
|
| 142 |
-
q = top_matches[0][0]
|
| 143 |
-
return q if len(q) <= 300 else q[:300] + "..."
|
| 144 |
-
|
| 145 |
-
user_toks = set(tokenize(user_text))
|
| 146 |
-
pivot = pick_pivot_token(top_matches, user_toks)
|
| 147 |
-
traits = []
|
| 148 |
-
used = set()
|
| 149 |
-
for q, _sc in top_matches:
|
| 150 |
-
tr = extract_traits_for_pivot(q, pivot) if pivot else None
|
| 151 |
-
if tr and tr not in used:
|
| 152 |
-
traits.append(tr)
|
| 153 |
-
used.add(tr)
|
| 154 |
-
if pivot and traits:
|
| 155 |
-
if len(traits) == 1:
|
| 156 |
-
trait_text = traits[0]
|
| 157 |
-
else:
|
| 158 |
-
trait_text = ", ".join(traits[:-1]) + " and " + traits[-1]
|
| 159 |
-
return f"The {pivot} is {trait_text}."
|
| 160 |
-
else:
|
| 161 |
-
# fallback: give short snippets from up to 3 quotes
|
| 162 |
-
snippets = []
|
| 163 |
-
for q, _sc in top_matches[:3]:
|
| 164 |
-
tokens = tokenize(q)
|
| 165 |
-
short = " ".join(tokens[:18])
|
| 166 |
-
if short not in snippets:
|
| 167 |
-
snippets.append(short + ("..." if len(tokens) > 18 else ""))
|
| 168 |
-
return " / ".join(snippets)
|
| 169 |
-
|
| 170 |
# -----------------------------
|
| 171 |
-
|
| 172 |
-
# -----------------------------
|
| 173 |
-
def assemble_response(category: str, user_text: str, domain_restrict: str = ""):
|
| 174 |
if category not in QUOTES:
|
| 175 |
-
return
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
# Tier 3: domain-aware search link
|
| 206 |
-
if domain_restrict and domain_restrict.strip():
|
| 207 |
-
# user supplied domain -> site search
|
| 208 |
-
q = f"site:{domain_restrict.strip()} {category}"
|
| 209 |
-
link = "https://www.google.com/search?q=" + urllib.parse.quote_plus(q)
|
| 210 |
-
else:
|
| 211 |
-
q = f"{category} college reviews"
|
| 212 |
-
link = "https://www.google.com/search?q=" + urllib.parse.quote_plus(q)
|
| 213 |
-
footer = f"For more info: {link}"
|
| 214 |
-
|
| 215 |
-
return (summary, details, footer)
|
| 216 |
-
|
| 217 |
-
# -----------------------------
|
| 218 |
-
# Conversation logging & export
|
| 219 |
-
# -----------------------------
|
| 220 |
-
conversation_log = [] # list of dicts: timestamp, category, user_message, response
|
| 221 |
-
|
| 222 |
-
def export_conversation_csv():
|
| 223 |
-
fname = "conversation_log.csv"
|
| 224 |
-
try:
|
| 225 |
-
with open(fname, "w", newline="", encoding="utf-8") as csvfile:
|
| 226 |
-
fieldnames = ["timestamp", "category", "user_message", "response"]
|
| 227 |
-
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
| 228 |
-
writer.writeheader()
|
| 229 |
-
writer.writerows(conversation_log)
|
| 230 |
-
return fname
|
| 231 |
-
except Exception as e:
|
| 232 |
-
print("CSV export error:", e)
|
| 233 |
-
return None
|
| 234 |
-
|
| 235 |
-
# -----------------------------
|
| 236 |
-
# Staging (collect responses into a category)
|
| 237 |
-
# -----------------------------
|
| 238 |
-
def stage_conversation_to_category(target_category: str):
|
| 239 |
-
"""
|
| 240 |
-
Append conversation responses into the chosen staging category in memory (QUOTES).
|
| 241 |
-
Returns a status message.
|
| 242 |
-
"""
|
| 243 |
-
if not target_category or not isinstance(target_category, str):
|
| 244 |
-
return "Invalid staging category name."
|
| 245 |
-
added = 0
|
| 246 |
-
QUOTES.setdefault(target_category, [])
|
| 247 |
-
existing = set(QUOTES[target_category])
|
| 248 |
-
for entry in conversation_log:
|
| 249 |
-
resp = entry.get("response") or entry.get("bot_response") or ""
|
| 250 |
-
if resp and resp not in existing:
|
| 251 |
-
QUOTES[target_category].append(resp)
|
| 252 |
-
existing.add(resp)
|
| 253 |
-
added += 1
|
| 254 |
-
return f"Added {added} unique responses to category '{target_category}'."
|
| 255 |
-
|
| 256 |
-
def save_quotes_to_disk():
|
| 257 |
-
"""
|
| 258 |
-
Overwrites DATA_PATH with current QUOTES in the new schema: {category: [quote, ...], ...}
|
| 259 |
-
Returns filepath on success or None.
|
| 260 |
-
"""
|
| 261 |
-
try:
|
| 262 |
-
with open(DATA_PATH, "w", encoding="utf-8") as f:
|
| 263 |
-
json.dump(QUOTES, f, ensure_ascii=False, indent=2)
|
| 264 |
-
return DATA_PATH
|
| 265 |
-
except Exception as e:
|
| 266 |
-
print("Error saving quotes.json:", e)
|
| 267 |
-
return None
|
| 268 |
-
|
| 269 |
-
# -----------------------------
|
| 270 |
-
# Chat respond callback (preserve chat UI behavior)
|
| 271 |
-
# -----------------------------
|
| 272 |
-
def respond(user_message, chat_history, category, domain_restrict):
|
| 273 |
-
if chat_history is None:
|
| 274 |
-
chat_history = []
|
| 275 |
-
|
| 276 |
if not QUOTES:
|
| 277 |
-
bot = "No dataset loaded. Please upload
|
| 278 |
-
|
| 279 |
-
return "",
|
| 280 |
|
| 281 |
if not category:
|
| 282 |
bot = "Please select a category."
|
| 283 |
-
|
| 284 |
-
return "",
|
| 285 |
|
| 286 |
-
#
|
| 287 |
-
|
| 288 |
-
|
|
|
|
|
|
|
| 289 |
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
"timestamp": datetime.datetime.now().isoformat(),
|
| 294 |
-
"category": category,
|
| 295 |
-
"user_message": user_message,
|
| 296 |
-
"response": combined
|
| 297 |
-
})
|
| 298 |
|
| 299 |
-
|
|
|
|
| 300 |
|
| 301 |
-
# -----------------------------
|
| 302 |
-
# File upload / download helpers
|
| 303 |
-
# -----------------------------
|
| 304 |
def upload_json(filepath):
|
| 305 |
-
global QUOTES
|
| 306 |
try:
|
| 307 |
with open(filepath, "r", encoding="utf-8") as f:
|
| 308 |
data = json.load(f)
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
merged.extend([str(q).strip() for q in items if isinstance(q, str) and q.strip()])
|
| 321 |
-
normalized[cat] = merged
|
| 322 |
-
else:
|
| 323 |
-
normalized[cat] = []
|
| 324 |
-
QUOTES = normalized
|
| 325 |
-
cats = sorted(list(QUOTES.keys()))
|
| 326 |
-
status = f"Loaded {len(cats)} categories from uploaded file."
|
| 327 |
-
return status, gr.update(choices=cats, value=(cats[0] if cats else None))
|
| 328 |
-
else:
|
| 329 |
-
return "Upload failed: JSON root must be an object/dict.", gr.update(choices=[])
|
| 330 |
except Exception as e:
|
| 331 |
return f"Error loading file: {e}", gr.update(choices=[])
|
| 332 |
|
| 333 |
-
def
|
| 334 |
-
"""
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
# -----------------------------
|
| 340 |
-
#
|
| 341 |
# -----------------------------
|
| 342 |
with gr.Blocks() as demo:
|
| 343 |
-
gr.Markdown("## 🎓 College Life Chatbot —
|
| 344 |
|
| 345 |
-
# Category dropdown (keys from QUOTES)
|
| 346 |
initial_categories = sorted(list(QUOTES.keys()))
|
|
|
|
| 347 |
with gr.Row():
|
| 348 |
-
category = gr.Dropdown(
|
| 349 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
|
| 351 |
-
# Chat area (unchanged look)
|
| 352 |
chatbot = gr.Chatbot(label="Conversation", height=360, type="tuples")
|
| 353 |
msg = gr.Textbox(label="Your message", placeholder="Ask something like: 'Is food good in college?'", autofocus=True)
|
| 354 |
send = gr.Button("Send")
|
| 355 |
clear = gr.Button("Clear")
|
| 356 |
|
| 357 |
-
# Data I/O row(s): uploader, dataset download, conversation export (stable Button -> File pattern)
|
| 358 |
-
with gr.Row():
|
| 359 |
-
uploader = gr.File(label="Upload dataset (.json)", file_types=[".json"], type="filepath")
|
| 360 |
-
upload_status = gr.Textbox(label="Upload status", interactive=False)
|
| 361 |
-
|
| 362 |
with gr.Row():
|
| 363 |
-
|
| 364 |
-
download_file = gr.File(label="Dataset file (click to download)", interactive=False)
|
| 365 |
-
|
| 366 |
-
export_btn = gr.Button("Export conversation (.csv)")
|
| 367 |
-
export_file = gr.File(label="Conversation CSV (click to download)", interactive=False)
|
| 368 |
-
|
| 369 |
-
# Staging controls (new)
|
| 370 |
-
with gr.Row():
|
| 371 |
-
staging_name = gr.Textbox(label="Staging category name", value="staged_responses", placeholder="Category name to stage exported replies")
|
| 372 |
-
stage_btn = gr.Button("Stage conversation to category")
|
| 373 |
stage_status = gr.Textbox(label="Stage status", interactive=False)
|
| 374 |
|
| 375 |
with gr.Row():
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
# Wiring chat events (preserve send/enter behavior)
|
| 380 |
-
msg.submit(respond, [msg, chatbot, category, domain_restrict], [msg, chatbot])
|
| 381 |
-
send.click(respond, [msg, chatbot, category, domain_restrict], [msg, chatbot])
|
| 382 |
-
clear.click(lambda: None, None, chatbot, queue=False)
|
| 383 |
-
|
| 384 |
-
# Upload wiring
|
| 385 |
-
uploader.upload(upload_json, uploader, [upload_status, category])
|
| 386 |
-
|
| 387 |
-
# Downloads + exports wiring
|
| 388 |
-
download_btn.click(download_quotes_file, outputs=download_file)
|
| 389 |
-
export_btn.click(export_conversation_csv, outputs=export_file)
|
| 390 |
-
|
| 391 |
-
# Staging wiring
|
| 392 |
-
def _stage_and_return_status(name):
|
| 393 |
-
msg = stage_conversation_to_category(name)
|
| 394 |
-
# reflect updated categories in dropdown
|
| 395 |
-
return msg, gr.update(choices=sorted(list(QUOTES.keys())), value=name)
|
| 396 |
|
| 397 |
-
|
|
|
|
|
|
|
|
|
|
| 398 |
|
| 399 |
-
|
| 400 |
-
path = save_quotes_to_disk()
|
| 401 |
-
if path:
|
| 402 |
-
return f"Saved to {path}"
|
| 403 |
-
return "Failed to save quotes.json"
|
| 404 |
|
| 405 |
-
|
|
|
|
| 406 |
|
| 407 |
# -----------------------------
|
| 408 |
# Startup log
|
| 409 |
# -----------------------------
|
| 410 |
print(f"===== Application Startup at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====")
|
| 411 |
if QUOTES:
|
| 412 |
-
for cat,
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
if __name__ == "__main__":
|
| 416 |
-
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 1 |
import json
|
|
|
|
|
|
|
|
|
|
| 2 |
import random
|
| 3 |
import re
|
| 4 |
import string
|
| 5 |
+
import os
|
| 6 |
+
import datetime
|
| 7 |
+
import difflib
|
| 8 |
import gradio as gr
|
|
|
|
| 9 |
|
| 10 |
# -----------------------------
# Config / data loading
# -----------------------------
DATA_PATH = "quotes.json"

def load_quotes():
    """Load the quotes dataset from DATA_PATH.

    Returns:
        tuple: ``(data, path)`` where ``data`` is the category -> entries
        mapping (always containing a ``"staged_responses"`` list) and
        ``path`` is DATA_PATH on success or ``None`` when no usable file
        was found.
    """
    if os.path.exists(DATA_PATH):
        try:
            with open(DATA_PATH, "r", encoding="utf-8") as f:
                data = json.load(f)
            # Keep the schema check consistent with upload_json: the JSON
            # root must be an object/dict, not a list or scalar.
            if not isinstance(data, dict):
                print(f"{DATA_PATH} root must be an object/dict; starting empty.")
            else:
                # Ensure staged_responses bucket always exists
                if "staged_responses" not in data:
                    data["staged_responses"] = []
                print(f"Loaded dataset from {DATA_PATH} with {len(data)} categories.")
                return data, DATA_PATH
        except Exception as e:
            print(f"Failed to load {DATA_PATH}: {e}")
    else:
        # Previously this case was silent; log it so startup issues are visible.
        print(f"{DATA_PATH} not found; starting with an empty dataset.")
    # fallback: empty dataset, no backing file
    return {"staged_responses": []}, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
+
QUOTES, DATA_PATH = load_quotes()
|
| 31 |
|
| 32 |
# -----------------------------
|
| 33 |
+
# Text helpers
|
| 34 |
# -----------------------------
|
|
|
|
|
|
|
| 35 |
# Common English function words to ignore when comparing texts.
STOPWORDS = {
    "the","a","an","and","or","but","if","then","so","than","to","of","in","on","at","for",
    "is","are","was","were","be","being","been","it","that","this","these","those","with",
    "as","by","from","about","into","over","after","before","up","down","out"
}

# Sentiment hint vocabularies used by infer_sentiment().
POS_HINTS = {"good","great","love","like","enjoy","awesome","amazing","nice","positive","best","fantastic","excellent"}
NEG_HINTS = {"bad","hate","dislike","worst","awful","terrible","negative","poor","meh","gross","unsafe","hard","difficult"}

# Compiled once: matches any single punctuation character.
punct_re = re.compile(f"[{re.escape(string.punctuation)}]")

def normalize(text: str) -> str:
    """Lowercase *text* and replace each punctuation character with a space."""
    lowered = (text or "").lower()
    return punct_re.sub(" ", lowered)

def tokenize(text: str):
    """Return the normalized, non-stopword tokens of *text* (order preserved)."""
    kept = []
    for tok in normalize(text).split():
        if tok and tok not in STOPWORDS:
            kept.append(tok)
    return kept
|
| 51 |
|
| 52 |
+
def infer_sentiment(user_text: str) -> str:
    """Guess whether *user_text* has positive or negative framing.

    Matches whole words rather than substrings: the previous version used
    ``hint in text``, so e.g. "dislike" also triggered the positive hint
    "like" and the result fell through to the default. Returns "positive"
    for neutral or mixed text.
    """
    words = set(normalize(user_text).split())
    has_pos = bool(words & POS_HINTS)
    has_neg = bool(words & NEG_HINTS)
    if has_pos and not has_neg:
        return "positive"
    if has_neg and not has_pos:
        return "negative"
    return "positive"  # default for neutral or mixed queries
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
# -----------------------------
|
| 63 |
+
# Retrieval with fuzzy matching
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# -----------------------------
|
| 65 |
+
def best_match_quote(category: str, sentiment: str, user_text: str) -> str:
    """Return the quote in *category* that best matches *user_text*.

    Scoring is the number of shared (stopword-free) tokens; when a quote
    shares none, difflib close-matching (cutoff 0.8) gives one point per
    fuzzily matched user token. *sentiment* is accepted for interface
    compatibility but does not influence scoring. Pool entries may be
    dicts with a "quote" key or plain strings (both dataset schemas occur).
    """
    if category not in QUOTES:
        return f"No quotes found for category '{category}'."
    pool = QUOTES[category]
    if not pool:
        return f"No quotes available in '{category}'."

    def _quote_text(entry):
        # Support both {"quote": ...} dict entries and bare string entries;
        # the old code assumed dicts and crashed on strings.
        if isinstance(entry, dict):
            return str(entry.get("quote", ""))
        return str(entry)

    q_tokens = set(tokenize(user_text))
    best_score = -1
    best_quote = None

    for entry in pool:
        text = _quote_text(entry)
        qtoks = set(tokenize(text))
        score = len(q_tokens & qtoks)
        # fuzzy matching fallback when there is no exact token overlap
        if score == 0:
            for word in q_tokens:
                if difflib.get_close_matches(word, qtoks, n=1, cutoff=0.8):
                    score += 1
        if score > best_score:
            best_score = score
            best_quote = text

    if not best_quote:
        candidates = [t for t in (_quote_text(e) for e in pool) if t]
        if not candidates:
            # Old code called random.choice on a possibly empty list (IndexError).
            return f"No quotes available in '{category}'."
        return random.choice(candidates)
    return best_quote
|
| 92 |
+
|
| 93 |
+
# -----------------------------
|
| 94 |
+
# Gradio callbacks
|
| 95 |
+
# -----------------------------
|
| 96 |
+
def respond(message, history, category, sentiment_choice):
    """Chat callback: append (user, bot) to *history* and clear the textbox.

    Returns ``("", history)`` so the message box empties after each send.
    """
    # The Chatbot value is None right after Clear (clear_chat returns None);
    # normalize to a list or the append below crashes. This guard existed in
    # the previous revision and was lost.
    if history is None:
        history = []

    if not QUOTES:
        bot = "No dataset loaded. Please upload a JSON file first."
        history.append((message, bot))
        return "", history

    if not category:
        bot = "Please select a category."
        history.append((message, bot))
        return "", history

    # sentiment not really used with interview-style data, but kept for compatibility
    if sentiment_choice == "auto":
        sent = infer_sentiment(message)
    else:
        sent = sentiment_choice

    bot = best_match_quote(category, sent, message)
    history.append((message, bot))
    return "", history
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
+
def clear_chat():
    """Reset the conversation; a gr.Chatbot treats None as 'empty'."""
    cleared_value = None
    return cleared_value
|
| 119 |
|
|
|
|
|
|
|
|
|
|
| 120 |
def upload_json(filepath):
    """Replace the in-memory dataset with the uploaded JSON file.

    Returns ``(status_message, dropdown_update)`` so the category dropdown
    reflects the new dataset. On failure the current dataset is untouched.
    """
    global QUOTES, DATA_PATH
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, dict):
            # Plain string status, consistent with the other return paths
            # (previously this one path wrapped the message in gr.update).
            return "Upload failed: JSON root must be an object.", gr.update(choices=[])

        if "staged_responses" not in data:
            data["staged_responses"] = []

        QUOTES = data
        # NOTE(review): basename only — assumes later saves happen in CWD.
        DATA_PATH = os.path.basename(filepath)
        cats = sorted(QUOTES.keys())
        status = f"Loaded {len(cats)} categories from {DATA_PATH}."
        return status, gr.update(choices=cats, value=(cats[0] if cats else None))
    except Exception as e:
        return f"Error loading file: {e}", gr.update(choices=[])
|
| 138 |
|
| 139 |
+
def download_current():
    """Write the current dataset (incl. staged responses) to an export file.

    Returns the written file's path for a gr.File output, or None on
    failure (gr.File renders None as 'no file').
    """
    tmp = "quotes_export.json"
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(QUOTES, f, indent=2, ensure_ascii=False)
        return tmp
    except Exception as e:
        # Best-effort by design, but log instead of silently swallowing.
        print("Dataset export error:", e)
        return None
|
| 149 |
+
|
| 150 |
+
def stage_conversation(history, category):
    """Copy the chat history into the 'staged_responses' bucket of QUOTES.

    Each exchange is stored as ``{"category", "user", "bot"}``. Returns a
    human-readable status message.
    """
    if not category:
        return "Please select a category to stage into."
    # Chatbot value may be None (right after Clear) or empty; the old code
    # crashed on len(None) / iterating None.
    if not history:
        return "Nothing to stage: the conversation is empty."

    staged = QUOTES.setdefault("staged_responses", [])
    for user_msg, bot_msg in history:
        staged.append({
            "category": category,
            "user": user_msg,
            "bot": bot_msg
        })
    return f"Staged {len(history)} exchanges into 'staged_responses'."
|
| 163 |
|
| 164 |
# -----------------------------
|
| 165 |
+
# UI
|
| 166 |
# -----------------------------
|
| 167 |
with gr.Blocks() as demo:
    gr.Markdown("## 🎓 College Life Chatbot — Category-Aware, Similarity Matching")

    initial_categories = sorted(list(QUOTES.keys()))

    with gr.Row():
        category = gr.Dropdown(
            label="Category",
            choices=initial_categories,
            value=(initial_categories[0] if initial_categories else None)
        )
        sentiment = gr.Dropdown(
            label="Sentiment",
            choices=["auto", "positive", "negative"],
            value="auto"
        )

    chatbot = gr.Chatbot(label="Conversation", height=360, type="tuples")
    msg = gr.Textbox(label="Your message", placeholder="Ask something like: 'Is food good in college?'", autofocus=True)
    send = gr.Button("Send")
    clear = gr.Button("Clear")

    with gr.Row():
        stage_btn = gr.Button("Stage Conversation to Category")
        stage_status = gr.Textbox(label="Stage status", interactive=False)

    with gr.Row():
        uploader = gr.File(label="Upload dataset (.json)", file_types=[".json"], type="filepath")
        upload_status = gr.Textbox(label="Upload status", interactive=False)
        download_btn = gr.Button("Prepare dataset download")
        downloader = gr.File(label="Download current dataset", interactive=False)

    # Wire events
    msg.submit(respond, [msg, chatbot, category, sentiment], [msg, chatbot])
    send.click(respond, [msg, chatbot, category, sentiment], [msg, chatbot])
    clear.click(clear_chat, None, chatbot, queue=False)

    stage_btn.click(stage_conversation, [chatbot, category], stage_status)

    uploader.upload(upload_json, uploader, [upload_status, category])
    # BUG FIX: `downloader.download(download_current)` never populated the
    # File component — gr.File's .download event fires when the USER
    # downloads an already-present file; it does not generate one. Use the
    # Button -> File pattern (as the previous revision did) so clicking the
    # button writes the export and feeds its path into the component.
    download_btn.click(download_current, outputs=downloader)
|
| 207 |
|
| 208 |
# -----------------------------
# Startup log
# -----------------------------
print(f"===== Application Startup at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====")
if QUOTES:
    # Summarize list-valued categories; staged_responses entries are dicts.
    for cat, items in QUOTES.items():
        if isinstance(items, list):
            print(f" - {cat}: {len(items)} entries")

if __name__ == "__main__":
    # The previous revision launched the app here; this one dropped it, so a
    # plain `python app.py` exited without serving. (HF Spaces' gradio SDK
    # may auto-launch `demo`, but the guard keeps local runs working.)
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
|
|