Spaces:
Sleeping
Sleeping
Upload 12 files
Browse files- app.py +101 -90
- concept_library.json +8 -0
- data_loader.py +203 -10
- diagram_generator.py +22 -0
- model.py +190 -193
- query_generator.py +110 -0
- requirements.txt +12 -5
- semantic_tree.py +71 -0
- tree_builder.py +6 -0
- tree_cache.json +3 -0
- tree_synthesizer.py +77 -0
app.py
CHANGED
|
@@ -1,90 +1,101 @@
|
|
| 1 |
-
# app.py
|
| 2 |
-
|
| 3 |
-
import gradio as gr
|
| 4 |
-
from model import oracle_sql_suggester
|
| 5 |
-
|
| 6 |
-
def chat_fn(message, history):
|
| 7 |
-
response = oracle_sql_suggester(message)
|
| 8 |
-
history = history or []
|
| 9 |
-
history.append(
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
history.
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
}
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
}
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --- PATCHED app.py ---
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
from model import oracle_sql_suggester
|
| 5 |
+
|
| 6 |
+
def chat_fn(message, history):
    """Handle one chat turn: run the suggester, append the exchange, clear input.

    Returns the updated messages-format history and an empty string (the
    second output clears the textbox).
    """
    answer = oracle_sql_suggester(message)
    chat_log = history or []
    chat_log.append({"role": "user", "content": message})

    # Debug print so malformed responses are visible in the server log.
    print("🛠️ Assistant Response:", repr(answer))

    if isinstance(answer, list):
        # Suggester may already return a list of message dicts.
        chat_log.extend(answer)
    else:
        # Coerce anything else to a string so Gradio can render it.
        text = answer if isinstance(answer, str) else str(answer)
        chat_log.append({"role": "assistant", "content": text})

    return chat_log, ""
|
| 24 |
+
|
| 25 |
+
def retry_last(history):
    """Re-run the suggester for the most recent user message.

    The chatbot uses ``type="messages"`` (dicts with ``role``/``content``),
    so the old tuple-style ``history[-1][0]`` access and tuple re-assignment
    raised at runtime.  Walk backwards to the last user turn, regenerate the
    answer, and replace (or append) the assistant reply.
    """
    if not history:
        return history
    for idx in range(len(history) - 1, -1, -1):
        entry = history[idx]
        if isinstance(entry, dict) and entry.get("role") == "user":
            response = oracle_sql_suggester(entry["content"])
            if not isinstance(response, str):
                response = str(response)
            reply = {"role": "assistant", "content": response}
            if idx + 1 < len(history):
                history[idx + 1] = reply
            else:
                history.append(reply)
            break
    return history
|
| 31 |
+
|
| 32 |
+
def undo_last(history):
    """Remove the most recent exchange from the chat history.

    With messages-format history one turn is two entries (user + assistant),
    so popping a single item left an orphaned user message.  Pop the trailing
    assistant reply and, if present, the user message that produced it.
    Empty history is returned unchanged.
    """
    if history:
        last = history.pop()
        if (
            isinstance(last, dict)
            and last.get("role") == "assistant"
            and history
            and isinstance(history[-1], dict)
            and history[-1].get("role") == "user"
        ):
            history.pop()
    return history
|
| 36 |
+
|
| 37 |
+
def process_upload(file):
    """Load prompt/answer pairs from an uploaded file and report the count."""
    # Local import keeps app startup independent of the loader's dependencies.
    from data_loader import load_prompts_from_file

    pairs = load_prompts_from_file(file.name)
    return f"✅ Uploaded {len(pairs)} prompt pairs!"
|
| 42 |
+
|
| 43 |
+
# --- Gradio UI ---------------------------------------------------------------
# Dark-themed Blocks layout.  The CSS forces a dark palette and Times New
# Roman everywhere, while the #chatbox-style overrides keep the chat area
# itself light for readability.
with gr.Blocks(
    css="""
    body, html, .gradio-container {
        background-color: #121212 !important;
        color: #ffffff !important;
        font-family: 'Times New Roman', serif !important;
    }
    .gr-chatbot {
        background-color: #1e1e2f !important;
        color: #ffffff !important;
    }
    .gr-button, .gr-textbox textarea {
        font-family: 'Times New Roman', serif !important;
    }
    .message.user {
        background-color: #2a2a3b !important;
        color: #e0e0e0 !important;
    }
    .message.bot {
        background-color: #333344 !important;
        color: #ffffff !important;
    }
    .gr-input, .gr-textbox, textarea {
        background-color: #2a2a2a !important;
        color: #ffffff !important;
        font-family: 'Times New Roman', serif !important;
    }
    #chatbox-style {
        background-color: #ffffff !important;
        color: #000000 !important;
        font-family: "Times New Roman", serif;
    }
    #chatbox-style .message.bot {
        background-color: #f5f5f5 !important;
        color: #000000 !important;
    }
    #chatbox-style .message.user {
        background-color: #e0e0e0 !important;
        color: #000000 !important;
    }
    """
) as demo:
    gr.Markdown("<h2 style='color: #ffffff; font-family: Times New Roman; text-align: center;'>🧠 Oracle SQL and PL/SQL Assistant</h2>")
    # type="messages" => history entries are {"role": ..., "content": ...} dicts.
    chatbot = gr.Chatbot(show_copy_button=True, height=450, elem_id="chatbox-style", type="messages")

    with gr.Row():
        txt = gr.Textbox(placeholder="Type your SQL or PL/SQL question here...", lines=2, scale=8)
        submit_btn = gr.Button("➡️ Submit", scale=1)
        retry_btn = gr.Button("🔄 Retry", scale=1)
        undo_btn = gr.Button("↩️ Undo", scale=1)
        clear_btn = gr.Button("🧹 Clear", scale=1)

    # Event wiring: submit via button or Enter (both also clear the textbox);
    # Retry/Undo edit the chat history in place; Clear resets it.
    submit_btn.click(chat_fn, [txt, chatbot], [chatbot, txt])
    txt.submit(chat_fn, [txt, chatbot], [chatbot, txt])
    retry_btn.click(retry_last, [chatbot], [chatbot])
    undo_btn.click(undo_last, [chatbot], [chatbot])
    clear_btn.click(lambda: [], None, chatbot)

demo.launch()
|
concept_library.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"project lifecycle": ["Initiation", "Planning", "Execution", "Closure"],
|
| 3 |
+
"approval flow": ["Request", "Review", "Decision", "Notification"],
|
| 4 |
+
"inventory tracking": ["Stock In", "Stock Out", "Audit", "Reorder"],
|
| 5 |
+
"recruitment process": ["Job Posting", "Screening", "Interview", "Offer Letter"],
|
| 6 |
+
"onboarding process": ["Document Collection", "Induction", "System Setup", "Training", "Access Provision"]
|
| 7 |
+
|
| 8 |
+
}
|
data_loader.py
CHANGED
|
@@ -1,22 +1,35 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
import os
|
|
|
|
|
|
|
|
|
|
| 4 |
import codecs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
def clean_sql_output(raw_text):
|
| 7 |
try:
|
| 8 |
-
# Decode escaped characters like \\n to real \n
|
| 9 |
decoded = codecs.decode(raw_text.strip(), 'unicode_escape')
|
| 10 |
-
|
| 11 |
-
return (
|
| 12 |
-
decoded.replace(";;", ";")
|
| 13 |
-
.replace("\n\n\n", "\n\n") # In case of extra breaks
|
| 14 |
-
.strip()
|
| 15 |
-
)
|
| 16 |
except Exception as e:
|
| 17 |
-
print("
|
| 18 |
return raw_text.strip()
|
| 19 |
|
|
|
|
| 20 |
def load_rules(file_path="data/train_data.txt"):
|
| 21 |
data = {}
|
| 22 |
if os.path.exists(file_path):
|
|
@@ -27,6 +40,7 @@ def load_rules(file_path="data/train_data.txt"):
|
|
| 27 |
data[key.strip().lower()] = clean_sql_output(value)
|
| 28 |
return data
|
| 29 |
|
|
|
|
| 30 |
def detect_domain(prompt):
|
| 31 |
prompt = prompt.lower()
|
| 32 |
if any(word in prompt for word in ["salary", "financial", "transaction", "ledger"]):
|
|
@@ -44,4 +58,183 @@ def load_rules_by_domain(prompt):
|
|
| 44 |
domain_rules = load_rules(domain_file)
|
| 45 |
if prompt in domain_rules:
|
| 46 |
return domain_rules[prompt]
|
| 47 |
-
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#data_loader
|
| 2 |
|
| 3 |
import os
|
| 4 |
+
import re
|
| 5 |
+
import json
|
| 6 |
+
import csv
|
| 7 |
import codecs
|
| 8 |
+
import requests
|
| 9 |
+
import PyPDF2
|
| 10 |
+
from docx import Document
|
| 11 |
+
import openpyxl
|
| 12 |
+
from bs4 import BeautifulSoup
|
| 13 |
|
| 14 |
+
# ? Normalize utility
|
| 15 |
+
def normalize_prompt(text):
    """Lowercase *text*, drop punctuation and common filler words, squeeze spaces."""
    cleaned = text.strip().lower()
    # Trailing punctuation first, then any remaining non-word characters.
    cleaned = re.sub(r"[;:,.!?]+$", "", cleaned)
    cleaned = re.sub(r"[^\w\s]", "", cleaned)
    # Strip filler/question words so lookups key on the meaningful terms.
    filler = r"\b(what|actually|please|tell|me|about|can|you|explain|is|the|do|does|give|show)\b(?!\w)"
    cleaned = re.sub(filler, "", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()
|
| 22 |
+
|
| 23 |
+
# ? Output cleanup for SQL responses
|
| 24 |
def clean_sql_output(raw_text):
    """Decode escaped sequences (literal ``\\n`` etc.) and tidy SQL text.

    NOTE(review): ``unicode_escape`` round-trips the text through latin-1,
    which can mangle non-ASCII characters — confirm inputs are ASCII-only.
    On decode failure the stripped input is returned unchanged.
    """
    stripped = raw_text.strip()
    try:
        decoded = codecs.decode(stripped, 'unicode_escape')
    except Exception as e:
        print("?? Cleaning error:", e)
        return stripped
    # Collapse doubled semicolons and excessive blank lines.
    return decoded.replace(";;", ";").replace("\n\n\n", "\n\n").strip()
|
| 31 |
|
| 32 |
+
# ? Existing basic rule loader
|
| 33 |
def load_rules(file_path="data/train_data.txt"):
|
| 34 |
data = {}
|
| 35 |
if os.path.exists(file_path):
|
|
|
|
| 40 |
data[key.strip().lower()] = clean_sql_output(value)
|
| 41 |
return data
|
| 42 |
|
| 43 |
+
# ? Domain routing logic
|
| 44 |
def detect_domain(prompt):
|
| 45 |
prompt = prompt.lower()
|
| 46 |
if any(word in prompt for word in ["salary", "financial", "transaction", "ledger"]):
|
|
|
|
| 58 |
domain_rules = load_rules(domain_file)
|
| 59 |
if prompt in domain_rules:
|
| 60 |
return domain_rules[prompt]
|
| 61 |
+
return None
|
| 62 |
+
|
| 63 |
+
# ? Extended loaders for structured files
|
| 64 |
+
def load_txt(path):
    """Read ``prompt = answer`` pairs from a plain-text file, one per line."""
    pairs = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            if '=' not in raw_line:
                continue
            question, reply = raw_line.split('=', 1)
            pairs.append((normalize_prompt(question), reply.strip()))
    return pairs
|
| 72 |
+
|
| 73 |
+
def load_json(path):
    """Read prompt/answer pairs from a JSON list of ``{"prompt", "answer"}`` objects.

    Entries that are not dicts or are missing either key are skipped instead
    of aborting the whole load with a ``KeyError`` (the previous behaviour).
    Returns a list of ``(normalized_prompt, answer)`` tuples.
    """
    pairs = []
    with open(path, 'r', encoding='utf-8') as f:
        for entry in json.load(f):
            if not isinstance(entry, dict):
                continue
            prompt = entry.get('prompt')
            answer = entry.get('answer')
            if prompt is None or answer is None:
                continue
            pairs.append((normalize_prompt(prompt), answer.strip()))
    return pairs
|
| 79 |
+
|
| 80 |
+
def load_csv(path):
    """Read prompt/answer pairs from a CSV with 'prompt' and 'answer' columns."""
    pairs = []
    with open(path, newline='', encoding='utf-8') as csvfile:
        for record in csv.DictReader(csvfile):
            if 'prompt' in record and 'answer' in record:
                pairs.append((normalize_prompt(record['prompt']), record['answer'].strip()))
    return pairs
|
| 88 |
+
|
| 89 |
+
def load_pdf(path):
    """Extract ``prompt = answer`` lines from a PDF's text content."""
    pairs = []
    with open(path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        # Extract each page's text once; drop pages with no extractable text.
        page_texts = [page.extract_text() for page in reader.pages]
        full_text = "\n".join(t for t in page_texts if t)
    for line in full_text.split("\n"):
        if '=' in line:
            question, reply = line.split('=', 1)
            pairs.append((normalize_prompt(question), reply.strip()))
    return pairs
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def load_docx(path):
    """Extract ``prompt = answer`` pairs from a Word document's paragraphs."""
    pairs = []
    for paragraph in Document(path).paragraphs:
        text = paragraph.text
        if "=" in text:
            question, reply = text.split("=", 1)
            pairs.append((normalize_prompt(question), reply.strip()))
    return pairs
|
| 110 |
+
|
| 111 |
+
def load_xlsx(path):
    """Extract prompt/answer pairs from an Excel workbook.

    Two layouts are supported: a ``prompt = answer`` string in column A, or
    the prompt in column A with the answer in column B (both strings).
    """
    pairs = []
    workbook = openpyxl.load_workbook(path)
    for sheet in workbook.worksheets:
        for row in sheet.iter_rows(values_only=True):
            if not row or len(row) < 2:
                continue
            first, second = row[0], row[1]
            if isinstance(first, str) and "=" in first:
                question, reply = first.split("=", 1)
                pairs.append((normalize_prompt(question), reply.strip()))
            elif isinstance(first, str) and isinstance(second, str):
                pairs.append((normalize_prompt(first), second.strip()))
    return pairs
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# ? Load from GitHub/HuggingFace (TXT/JSON)
|
| 129 |
+
def fetch_text_from_url(url):
    """Download *url* and return its body as text; return '' on any failure."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as exc:
        print(f"?? Error reading remote file {url}: {exc}")
        return ""
|
| 137 |
+
# ? Dispatcher for local files
|
| 138 |
+
def load_prompts_from_file(path):
    """Dispatch *path* to the loader matching its file extension.

    Extensions are now matched case-insensitively (``.TXT`` previously fell
    through to the "unsupported" branch).  Returns a list of
    ``(normalized_prompt, answer)`` tuples, or ``[]`` for unknown formats.
    """
    loaders = {
        '.txt': load_txt,
        '.json': load_json,
        '.csv': load_csv,
        '.pdf': load_pdf,
        '.docx': load_docx,
        '.xlsx': load_xlsx,
    }
    ext = os.path.splitext(path)[1].lower()
    loader = loaders.get(ext)
    if loader is None:
        print(f"? Unsupported format: {path}")
        return []
    return loader(path)
|
| 154 |
+
|
| 155 |
+
def load_prompts_from_url(url):
    """Fetch a remote .txt or .json file and parse prompt/answer pairs from it."""
    text = fetch_text_from_url(url)
    if not text:
        return []

    pairs = []
    if url.endswith(".txt"):
        for line in text.splitlines():
            if '=' in line:
                question, reply = line.split('=', 1)
                pairs.append((normalize_prompt(question), reply.strip()))
    elif url.endswith(".json"):
        try:
            for entry in json.loads(text):
                pairs.append((normalize_prompt(entry['prompt']), entry['answer'].strip()))
        except Exception as exc:
            print(f"?? JSON parsing failed: {exc}")
    return pairs
|
| 174 |
+
|
| 175 |
+
def load_prompt_pairs(path):
    """Load ``(normalized_prompt, answer)`` pairs from a local path or HTTP(S) URL.

    Remote URLs support JSON, CSV, TXT and PDF; any local path is parsed line
    by line as ``prompt = answer`` text regardless of extension.
    """
    import json, csv
    import requests
    import io
    import PyPDF2

    ext = path.split(".")[-1].lower()
    pairs = []

    if path.startswith("http"):
        response = requests.get(path)
        response.raise_for_status()
        content = response.content

        if ext == "json":
            for entry in json.loads(content.decode("utf-8")):
                pairs.append((normalize_prompt(entry['prompt']), entry['answer'].strip()))
        elif ext == "csv":
            for row in csv.DictReader(io.StringIO(content.decode("utf-8"))):
                pairs.append((normalize_prompt(row['prompt']), row['answer'].strip()))
        elif ext == "txt":
            for line in content.decode("utf-8", errors="replace").splitlines():
                if "=" in line:
                    question, reply = line.split("=", 1)
                    pairs.append((normalize_prompt(question), reply.strip()))
        elif ext == "pdf":
            reader = PyPDF2.PdfReader(io.BytesIO(content))
            for page in reader.pages:
                page_text = page.extract_text()
                if not page_text:
                    continue
                for line in page_text.splitlines():
                    if "=" in line:
                        question, reply = line.split("=", 1)
                        pairs.append((normalize_prompt(question), reply.strip()))
    else:
        # Local file: treat as line-oriented "prompt = answer" text.
        with open(path, "r", encoding="utf-8", errors="replace") as handle:
            for raw_line in handle:
                line = raw_line.strip()
                if "=" in line:
                    question, reply = line.split("=", 1)
                    pairs.append((normalize_prompt(question), reply.strip()))

    return pairs
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def list_files_from_github_folder(github_folder_url):
    """Scrape a GitHub folder page and return raw.githubusercontent.com URLs
    for the supported data files it links to.

    The previous raw-URL reconstruction spliced host and path together with
    chained ``split`` calls, producing malformed links; instead, rewrite the
    scraped href by dropping the ``/blob/``/``/tree/`` segment and prefixing
    the raw-content host.

    NOTE(review): GitHub now renders file listings client-side, so the
    ``a.js-navigation-open`` selector may match nothing on current pages —
    verify against the target repository (the GitHub contents API is the
    robust alternative).  Returns ``[]`` on any error.
    """
    supported = (".txt", ".json", ".csv", ".pdf", ".docx", ".xlsx")
    try:
        html = requests.get(github_folder_url).text
        soup = BeautifulSoup(html, "lxml")
        file_links = []
        for link in soup.select("a.js-navigation-open"):
            href = link.get("href", "")
            if href.endswith(supported):
                # href looks like /owner/repo/blob/branch/path/file.ext
                raw_path = href.replace("/blob/", "/", 1).replace("/tree/", "/", 1)
                file_links.append(f"https://raw.githubusercontent.com{raw_path}")
        return file_links
    except Exception as e:
        print("⚠️ GitHub scan error:", e)
        return []
|
| 239 |
+
|
| 240 |
+
|
diagram_generator.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#diagram_generator.py
|
| 2 |
+
|
| 3 |
+
import graphviz
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
def generate_plsql_structure_chart(output_path="output/plsql_structure"):
    """Render a simple PL/SQL block-structure diagram and return the PNG path.

    ``graphviz``'s ``render()`` already returns the output path *including*
    the format suffix (e.g. ``output/plsql_structure.png``); the previous
    code appended ``".png"`` again and returned a non-existent
    ``".png.png"`` path.
    """
    dot = graphviz.Digraph(format="png")
    dot.attr(rankdir='TB', bgcolor="lightyellow", fontname="Arial")

    # One node per PL/SQL block section.
    dot.node("START", "BEGIN", shape="oval", style="filled", fillcolor="lightgreen")
    dot.node("DECLARE", "DECLARE", shape="box", style="filled", fillcolor="lightblue")
    dot.node("EXCEPTION", "EXCEPTION", shape="box", style="filled", fillcolor="orange")
    dot.node("END", "END", shape="oval", style="filled", fillcolor="lightgreen")

    # Linear flow between the sections.
    dot.edge("START", "DECLARE")
    dot.edge("DECLARE", "EXCEPTION")
    dot.edge("EXCEPTION", "END")

    # render() returns e.g. "output/plsql_structure.png"
    return dot.render(output_path, cleanup=True)
|
model.py
CHANGED
|
@@ -1,193 +1,190 @@
|
|
| 1 |
-
# model.py
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
import re
|
| 5 |
-
import torch
|
| 6 |
-
import
|
| 7 |
-
import
|
| 8 |
-
from
|
| 9 |
-
from
|
| 10 |
-
from
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
#
|
| 43 |
-
def
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
#
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
]
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
print("β οΈ LLM fallback error:", e)
|
| 192 |
-
return "π€ Sorry, I couldnβt process that locally. Please try a simpler prompt."
|
| 193 |
-
|
|
|
|
| 1 |
+
# --- PATCHED model.py ---
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
import torch
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import random
|
| 8 |
+
from tree_builder import generate_tree_for_prompt
|
| 9 |
+
from query_generator import generate_dynamic_query
|
| 10 |
+
from sentence_transformers import SentenceTransformer, util
|
| 11 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 12 |
+
from sql_templates import sql_templates, sql_keyword_aliases, fuzzy_aliases, conflicting_phrases, greeting_templates
|
| 13 |
+
from data_loader import (
|
| 14 |
+
load_prompt_pairs,
|
| 15 |
+
load_prompts_from_file,
|
| 16 |
+
load_prompts_from_url,
|
| 17 |
+
normalize_prompt,
|
| 18 |
+
clean_sql_output,
|
| 19 |
+
detect_domain,
|
| 20 |
+
load_rules,
|
| 21 |
+
load_rules_by_domain
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
# ========== Load Resources ==========
# Search locations for domain data files: local "data" directory first, then
# the GitHub raw-content mirror.
DATA_DIR = [
    "data",
    "https://raw.githubusercontent.com/orachamp1981/PL-SQL-Development/orachamp1981-patch-1"
]

# Logical domain name -> data file name (resolved against DATA_DIR entries).
DOMAIN_FILES = {
    "sql": "sql.txt",
    "plsql": "plsql.txt",
    "sql_plsql_interview": "interview_sql_pl_sql_question.txt",
    "pl_sql": "email_send.txt",
    "faq": "oracle_faq.json",
    "guides": "best_practices.pdf"
}

# Generic knowledge base merged in regardless of domain.
FALLBACK_FILE = "train_data.txt"
# Sentence-embedding model used for semantic prompt matching.
model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
|
| 41 |
+
|
| 42 |
+
# -- NEW: Sanitize broken unicode --
|
| 43 |
+
def sanitize_unicode(text):
    """Replace unpaired surrogates in *text* so it can be safely serialized.

    Non-string values are returned untouched.
    """
    if not isinstance(text, str):
        return text
    # Round-trip through UTF-16: 'surrogatepass' lets lone surrogates encode,
    # and 'replace' on decode swaps them for U+FFFD.
    return text.encode('utf-16', 'surrogatepass').decode('utf-16', 'replace')
|
| 47 |
+
|
| 48 |
+
def clean_response(text):
    """Turn literal ``\\n`` sequences into newlines, collapse ``;;``, and trim."""
    unescaped = text.replace("\\n", "\n")
    return unescaped.replace(";;", ";").strip()
|
| 50 |
+
|
| 51 |
+
def is_definition_prompt(prompt: str) -> bool:
    """Return True when the prompt asks for a definition or a comparison.

    Checks the raw lowercased prompt: ``normalize_prompt`` strips filler
    words such as "what" and "is", which made the previous
    ``startswith("what is")`` test unreachable (always False).
    """
    # Collapse whitespace so "what   is" still matches.
    raw = " ".join(prompt.strip().lower().split())
    return (
        raw.startswith("what is")
        or raw.startswith("define")
        or "difference between" in raw
    )
|
| 58 |
+
|
| 59 |
+
def is_dynamic_sql_prompt(prompt: str) -> bool:
    """Heuristic: does the prompt ask for a generated schema/query/report?"""
    normalized = normalize_prompt(prompt)
    dynamic_terms = ("tables", "schema", "design", "query", "join",
                     "reports", "forms", "structure", "modules")
    mentions_dynamic = any(term in normalized for term in dynamic_terms)
    mentions_target = "sql" in normalized or "module" in normalized or "table" in normalized
    return mentions_dynamic and mentions_target
|
| 63 |
+
|
| 64 |
+
def load_all_embeddings():
    """Load every domain data file (local first, then remote mirrors) and
    pre-compute sentence embeddings for all prompts.

    Returns ``(pairs, embeddings)`` where *pairs* is a list of
    ``(normalized_prompt, answer)`` tuples; ``([], None)`` when nothing loads.
    """
    pairs = []
    failed_urls = set()

    for rel_path in DOMAIN_FILES.values():
        loaded = False
        local_path = os.path.join("data", rel_path)
        if os.path.exists(local_path):
            pairs.extend(load_prompts_from_file(local_path))
            loaded = True
        else:
            for base in DATA_DIR:
                if not base.startswith("http"):
                    continue
                full_url = f"{base}/{rel_path}"
                # Skip URLs that already failed, and PDFs (not parseable
                # through the text fetcher).
                if full_url in failed_urls or full_url.endswith(".pdf"):
                    continue
                remote_pairs = load_prompts_from_url(full_url)
                if remote_pairs:
                    pairs.extend(remote_pairs)
                    loaded = True
                    break
                failed_urls.add(full_url)

        if not loaded:
            print(f"⚠️ Could not load file: {rel_path}")

    # Always merge the generic fallback knowledge base.
    pairs.extend(load_prompts_from_file(os.path.join("data", FALLBACK_FILE)))
    if not pairs:
        return [], None

    prompts = [pair[0] for pair in pairs]
    return pairs, model.encode(prompts, convert_to_tensor=True)
|
| 99 |
+
|
| 100 |
+
# Knowledge base and its prompt embeddings, computed once at import time.
ALL_PAIRS, ALL_EMBEDDINGS = load_all_embeddings()

# ========== Load LLM ==========
# Small local chat model used as the last-resort fallback generator.
llm_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(llm_name)
llm_model = AutoModelForCausalLM.from_pretrained(
    llm_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
llm_pipeline = pipeline("text-generation", model=llm_model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
|
| 109 |
+
|
| 110 |
+
# ========== Oracle SQL Assistant ==========
|
| 111 |
+
def oracle_sql_suggester(prompt):
    """Answer an Oracle SQL / PL/SQL question via a cascade of strategies.

    Order: greetings -> acknowledgements -> conflicting-term replies ->
    definition lookup -> dynamic SQL generation -> workflow tree ->
    semantic KB match -> keyword templates -> local LLM fallback.
    Always returns a sanitized string.

    Fixes vs. the previous version: ``sql_templates.get(...)`` could return
    ``None`` for unmapped alias/fuzzy keys, which was passed straight back to
    the caller and rendered as "None"; missing templates now fall through to
    the next strategy.  The duplicated semantic-match code is factored into
    ``_best_semantic_match``.
    """
    norm_prompt = normalize_prompt(prompt)

    # Extra greeting replies (idempotent update, kept for compatibility with
    # the module-level templates).
    greeting_templates.update({
        "how are you": "🤖 I'm just code, but thanks for asking! Ready to help with Oracle SQL or PL/SQL.",
        "how r u": "🤖 I'm doing great in the cloud ☁️. Let's solve some SQL problems!"
    })
    for greet_key, greet_reply in greeting_templates.items():
        if greet_key in norm_prompt:
            return sanitize_unicode(greet_reply)

    # Bare acknowledgements get a short conversational reply.
    ack_inputs = ["okay", "ok", "got it", "thanks", "thank you", "cool", "alright", "great"]
    ack_replies = [
        "👍 Great! Let me know if you want to continue or explore another topic.",
        "🙂 Got it! I'm here if you need help with anything else.",
        "✅ Understood. Feel free to ask the next question whenever you're ready.",
        "Glad to hear that! Would you like to dive deeper or move on?",
        "Perfect! Let me know what you'd like to explore next."
    ]
    if norm_prompt in ack_inputs:
        return sanitize_unicode(random.choice(ack_replies))

    # Canned responses for prompts that mix contradictory terms.
    for terms, response in conflicting_phrases.items():
        if all(term in norm_prompt for term in terms):
            return sanitize_unicode(response)

    # Definition-style questions go straight to the semantic knowledge base.
    if is_definition_prompt(prompt):
        match = _best_semantic_match(norm_prompt)
        if match is not None:
            return match
        return sanitize_unicode("🤔 I couldn't find a strong match. Try rephrasing or ask something more specific.")

    # Schema/query design prompts are answered by the dynamic generator.
    if is_dynamic_sql_prompt(prompt):
        result = generate_dynamic_query(prompt)
        if result:
            return sanitize_unicode(f"🤖 (Dynamic SQL):\n{result}")

    # Workflow/process prompts may map to a concept tree.
    tree = generate_tree_for_prompt(prompt)
    if tree:
        return sanitize_unicode(str(tree))

    # General semantic fallback over the whole knowledge base.
    match = _best_semantic_match(norm_prompt)
    if match is not None:
        return match

    # Keyword/alias template lookups; missing template keys are skipped.
    for word in norm_prompt.split():
        if word in sql_keyword_aliases:
            template = sql_templates.get(sql_keyword_aliases[word])
            if template is not None:
                return sanitize_unicode(template)
    for key, val in sql_templates.items():
        if key in norm_prompt or key.replace("_", " ") in norm_prompt:
            return sanitize_unicode(val)
    for fuzzy, target_key in fuzzy_aliases.items():
        if fuzzy in norm_prompt:
            template = sql_templates.get(target_key)
            if template is not None:
                return sanitize_unicode(template)

    # Last resort: ask the local LLM.
    try:
        prompt_text = f"Generate an Oracle SQL query or explanation for the following:\n{prompt}\n\nSQL:"
        output = llm_pipeline(prompt_text, max_new_tokens=256, do_sample=True, temperature=0.5)[0]["generated_text"]
        return sanitize_unicode("🤖 (LLM): " + output.split("SQL:")[-1].strip())
    except Exception as e:
        print("⚠️ LLM fallback error:", e)
        return sanitize_unicode("🤖 Sorry, I couldn’t process that locally. Please try a simpler prompt.")


def _best_semantic_match(norm_prompt, threshold=0.6):
    """Return the sanitized KB answer whose prompt best matches *norm_prompt*,
    or None when embeddings are unavailable or the best score is < threshold."""
    if ALL_EMBEDDINGS is None:
        return None
    user_embedding = model.encode(norm_prompt, convert_to_tensor=True)
    scores = util.cos_sim(user_embedding, ALL_EMBEDDINGS)[0]
    best_idx = torch.argmax(scores).item()
    if scores[best_idx].item() >= threshold:
        return sanitize_unicode(clean_response(ALL_PAIRS[best_idx][1]))
    return None
|
|
|
|
|
|
|
|
|
query_generator.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# query_generator.py
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from sentence_transformers import SentenceTransformer, util
|
| 5 |
+
|
| 6 |
+
# Embedding model used for fuzzy matching of module names.
model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")

# Sample domain entities: module name -> tables belonging to that module.
MODULE_TABLES = {
    "hr": ["employees", "departments", "payroll", "leave", "attendance"],
    "payroll": ["payroll", "salary", "deductions", "benefits"],
    "inventory": ["items", "stock", "suppliers", "reorder"],
    "finance": ["accounts", "transactions", "expenses", "budget"],
    "employee": ["employees", "leave", "payroll", "attendance"]
}

# Module name -> its data-entry forms and reports.
FORMS_REPORTS = {
    "hr": ["employee_form", "leave_application", "recruitment_form"],
    "payroll": ["salary_report", "deduction_report", "payslip_form"],
    "inventory": ["stock_report", "item_entry_form"],
    "finance": ["budget_report", "expense_form"]
}
|
| 23 |
+
|
| 24 |
+
def normalize(text):
    """Lowercase *text* and drop every character that is not alphanumeric or whitespace."""
    lowered = text.lower()
    return re.sub(r"[^a-zA-Z0-9\s]", "", lowered).strip()
|
| 26 |
+
|
| 27 |
+
def get_best_match(prompt, candidates):
    """Return ``(best_candidate, similarity)`` by cosine similarity of embeddings."""
    prompt_vec = model.encode(prompt, convert_to_tensor=True)
    candidate_vecs = model.encode(candidates, convert_to_tensor=True)
    similarities = util.cos_sim(prompt_vec, candidate_vecs)[0]
    best = similarities.argmax().item()
    return candidates[best], similarities[best].item()
|
| 33 |
+
|
| 34 |
+
def extract_module_from_prompt(prompt):
    """Return the MODULE_TABLES key mentioned in *prompt*, falling back to the
    closest semantic match (None when the best similarity is below 0.4)."""
    normalized = normalize(prompt)
    for module_name in MODULE_TABLES:
        if module_name in normalized:
            return module_name
    best, similarity = get_best_match(normalized, list(MODULE_TABLES.keys()))
    return best if similarity >= 0.4 else None
|
| 41 |
+
|
| 42 |
+
def infer_fields_from_prompt(prompt):
    """Map keywords in *prompt* to column names; default to ``["*"]``.

    Keywords are now matched as whole words: the previous substring test made
    e.g. "paid" or "provide" trigger the "id" -> emp_id mapping.
    """
    words = set(normalize(prompt).split())
    keyword_to_field = [
        ("name", "emp_name"),
        ("salary", "salary"),
        ("leave", "leave_days"),
        ("department", "dept_name"),
        ("id", "emp_id"),
    ]
    fields = [field for keyword, field in keyword_to_field if keyword in words]
    return fields or ["*"]
|
| 51 |
+
|
| 52 |
+
def generate_join_query(module, fields):
    """Build a naive SELECT ... JOIN statement for an ERP module.

    Args:
        module: key into MODULE_TABLES (may be None or unknown).
        fields: column names from infer_fields_from_prompt(), or ["*"].

    Returns:
        A single SQL string; a placeholder query when the module has no
        registered tables.
    """
    tables = MODULE_TABLES.get(module, [])
    if not tables:
        return "SELECT * FROM some_table"

    base_table = tables[0]

    if fields == ["*"]:
        # Wildcard request: expand every joined table.
        select_parts = [f"{t}.*" for t in tables]
    else:
        # Qualify requested columns with the base table only. The previous
        # code prefixed every field with EVERY table, yielding duplicated
        # and mostly non-existent columns in the SELECT list.
        select_parts = [f"{base_table}.{f}" for f in fields]

    joins = []
    for t in tables[1:]:
        if "dept" in t:
            joins.append(f"JOIN {t} ON {base_table}.dept_id = {t}.dept_id")
        elif "payroll" in t or "leave" in t:
            joins.append(f"JOIN {t} ON {base_table}.emp_id = {t}.emp_id")
        else:
            # No known relationship: fall back to a cross join.
            joins.append(f"JOIN {t} ON 1=1")

    return f"SELECT {', '.join(select_parts)} FROM {base_table} " + " ".join(joins)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def generate_forms_reports_query(module):
    """Combine every known form/report of *module* into one UNION ALL query."""
    report_objects = FORMS_REPORTS.get(module, [])
    if not report_objects:
        return f"-- No forms or reports found for module '{module}'"
    selects = [f"SELECT * FROM {obj}" for obj in report_objects]
    return "\nUNION ALL\n".join(selects)
|
| 87 |
+
|
| 88 |
+
def generate_dynamic_query(prompt):
    """Route a natural-language prompt to the matching SQL generator.

    Returns a formatted SQL suggestion string, or None so the caller
    (model.py) can apply its own fallback when no keyword matches.

    NOTE(review): the leading emoji in the response prefix was
    mojibake-corrupted in the source; restored to the robot emoji.
    """
    prompt_norm = normalize(prompt)

    if any(kw in prompt_norm for kw in ["form", "report", "forms", "reports"]):
        module = extract_module_from_prompt(prompt)
        return "🤖 (Dynamic SQL):\n" + generate_forms_reports_query(module)

    # The schema/join/select branches all produced the exact same join
    # query, so they are folded into a single keyword group.
    join_keywords = [
        "table", "tables", "schema", "structure", "design",
        "join", "query", "select", "show", "get", "fetch",
    ]
    if any(kw in prompt_norm for kw in join_keywords):
        module = extract_module_from_prompt(prompt)
        fields = infer_fields_from_prompt(prompt)
        return "🤖 (Dynamic SQL):\n" + generate_join_query(module, fields)

    return None  # let model.py handle fallback if no match
|
requirements.txt
CHANGED
|
@@ -1,5 +1,12 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# REQUIREMENT
|
| 2 |
+
|
| 3 |
+
gradio>=4.14.0
|
| 4 |
+
# (removed duplicate: "gradio" is already pinned above as gradio>=4.14.0)
|
| 5 |
+
torch
|
| 6 |
+
sentence-transformers
|
| 7 |
+
transformers
|
| 8 |
+
accelerate
|
| 9 |
+
graphviz
|
| 10 |
+
PyPDF2
|
| 11 |
+
python-docx
|
| 12 |
+
openpyxl
|
semantic_tree.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# semantic_tree.py

import os
import json
import re
from sentence_transformers import SentenceTransformer, util

# Load embedding model (shared by all helpers in this module).
model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")

# Path to concept memory file, resolved relative to this module.
# Presumably JSON of {concept name: [child topics]} — format_tree() below
# iterates the values as lists of children.
CONCEPT_DB = os.path.join(os.path.dirname(__file__), "concept_library.json")

# Load or initialize concept memory; a missing file just means an empty map.
if os.path.exists(CONCEPT_DB):
    with open(CONCEPT_DB, "r", encoding="utf-8") as f:
        concept_map = json.load(f)
else:
    concept_map = {}
|
| 20 |
+
|
| 21 |
+
# Utility: Format dictionary as tree

def format_tree(title, tree_dict):
    """Render {parent: [children]} as a text tree rooted at *title*.

    NOTE(review): the branch glyphs were mojibake-corrupted in the source;
    restored to the standard box-drawing set (├──, └──, │) that the cached
    outputs in tree_cache.json use.
    """
    output = [title]
    parents = list(tree_dict.keys())
    for i, parent in enumerate(parents):
        is_last = (i == len(parents) - 1)
        output.append(f"{'└──' if is_last else '├──'} {parent}")
        children = tree_dict[parent]
        for j, child in enumerate(children):
            is_last_child = (j == len(children) - 1)
            # Continue the vertical rail unless this was the last parent.
            prefix = "    " if is_last else "│   "
            output.append(f"{prefix}{'└──' if is_last_child else '├──'} {child}")
    return "\n".join(output)
|
| 34 |
+
|
| 35 |
+
# Extract base concept phrases

def normalize_prompt(prompt):
    """Lowercase the prompt, drop punctuation/symbols, and trim whitespace."""
    lowered = prompt.lower()
    return re.sub(r"[^a-zA-Z0-9\s]", "", lowered).strip()
|
| 39 |
+
|
| 40 |
+
# Semantic search over known concepts
def match_concept(prompt, concept_map, threshold=0.65):
    """Return the concept_map key most similar to *prompt*, or None.

    Similarity is cosine similarity of SentenceTransformer embeddings;
    the best match is rejected when its score falls below *threshold*.
    An empty concept_map short-circuits to None.
    """
    if not concept_map:
        return None
    prompt_embed = model.encode(prompt, convert_to_tensor=True)
    keys = list(concept_map.keys())
    key_embeds = model.encode(keys, convert_to_tensor=True)
    sims = util.cos_sim(prompt_embed, key_embeds)[0]
    best_idx = sims.argmax().item()
    if sims[best_idx] >= threshold:
        return keys[best_idx]
    return None
|
| 52 |
+
|
| 53 |
+
# Generate tree from prompt meaning
def generate_semantic_tree(prompt):
    """Build a text tree diagram for *prompt*.

    Prefers a semantic match against the persisted concept library;
    otherwise falls back to a crude two-cluster split of the prompt's
    own keywords.
    """
    norm_prompt = normalize_prompt(prompt)

    match = match_concept(norm_prompt, concept_map)
    if match:
        return format_tree(match.title(), {match.title(): concept_map[match]})

    # No known match: fallback clustering logic (primitive expansion).
    # Words of length <= 2 are treated as noise and dropped.
    keywords = [w for w in norm_prompt.split() if len(w) > 2]
    root = "Inferred Structure"
    # `or 1` keeps the split point off zero so "Concept A" is never empty
    # when at least one keyword exists.
    grouped = {
        "Concept A": keywords[:len(keywords)//2 or 1],
        "Concept B": keywords[len(keywords)//2 or 1:]
    }
    return format_tree(root, grouped)
|
| 69 |
+
|
| 70 |
+
# Optional: expand concept map dynamically
|
| 71 |
+
# Could be added later to "learn" from corrections
|
tree_builder.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# tree_builder.py

from tree_synthesizer import generate_step_tree

def generate_tree_for_prompt(prompt: str) -> str:
    """Thin facade over tree_synthesizer.generate_step_tree().

    Callers can import only this module rather than the synthesizer.
    """
    return generate_step_tree(prompt)
|
tree_cache.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"workflow of inventory tracking": "You're an expert at designing hierarchies and tree structures.\nYour task is to create a clear, non-repetitive, 2\u20133 level tree-style diagram for the following topic:\n\"workflow of inventory tracking\"\nFormatting rules:\n- Use \"\u251c\u2500\u2500\" for branches and \"\u2514\u2500\u2500\" for final child.\n- Do NOT repeat the topic name in multiple branches.\n- Do NOT output extra text like introductions or notes.\nExamples:\nERP System\n\u251c\u2500\u2500 Finance\n\u251c\u2500\u2500 Inventory\n\u251c\u2500\u2500 Sales\n\u2514\u2500\u2500 HR\nHospital Admissions\n\u251c\u2500\u2500 Patient Details\n\u2502 \u251c\u2500\u2500 Demographics\n\u2502 \u251c\u2500\u2500 Insurance\n\u2502 \u2514\u2500\u2500 Emergency Contact\n\u251c\u2500\u2500 Admission Records\n\u2514\u2500\u2500 Discharge Process\nNow generate the tree for:\nExpected Output:\n```\n\u2502 \u251c\u2500\u2500 Inventory\n\u2502 \u2502 \u251c\u2500\u2500 Stock Levels\n\u2502 \u2502 \u251c\u2500\u2500 Purchase Orders\n\u2502 \u2502 \u251c\u2500\u2500 Sales Orders\n\u2502 \u2502 \u2514\u2500\u2500 Inventory Reports\n\u2502 \u2514\u2500\u2500 Ledger\n\u2502 \u251c\u2500\u2500 Cash\n\u2502 \u251c\u2500\u2500 Bank Accounts\n\u2502 \u2514\u2500\u2500 Payroll\n\u2502 \u251c\u2500\u2500 Stock Levels\n\u2502 \u251c\u2500\u2500 Sales Orders\n\u2502 \u2502 \u251c\u2500\u2500 Inventory Reports\n\u2502 \u2502 \u2514\u2500\u2500 Sales Reports\n\u251c\u2500\u2500 HR\n\u2502 \u251c\u2500\u2500 Employee Data\n\u2502 \u2502 \u251c\u2500\u2500 Employee Handbook\n\u2502 \u2502 \u2514\u2500\u2500 Evaluation Forms\n\u2502 \u2514\u2500\u2500 Leave Requests\n\u2514\u2500\u2500 Admissions\n \u251c\u2500\u2500 Ad"
|
| 3 |
+
}
|
tree_synthesizer.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# tree_synthesizer.py

import re
from sentence_transformers import SentenceTransformer, util

# Embedding model used to rank workflow steps against the user's prompt.
model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
|
| 7 |
+
|
| 8 |
+
# Generic workflow step names; select_relevant_steps() picks the top-k of
# these by embedding similarity to the user's prompt.
COMMON_STEPS = [
    "Requirements Gathering", "Design", "Development", "Testing", "Deployment",
    "Monitoring", "Approval", "Rejection", "Review", "Validation", "Planning",
    "Execution", "Documentation", "Evaluation", "Feedback", "Support", "Training",
    "Payment", "Notification", "Registration", "Submission", "Completion"
]
|
| 14 |
+
|
| 15 |
+
def normalize_prompt(text):
    """Strip punctuation/symbols and lowercase *text* for keyword matching."""
    stripped = re.sub(r"[^a-zA-Z0-9\s]", "", text.lower())
    return stripped.strip()
|
| 17 |
+
|
| 18 |
+
def extract_domain(prompt):
    """Drop generic workflow words, leaving the domain-specific tokens."""
    stopwords = {"workflow", "steps", "process", "structure", "of", "in", "the", "how", "does"}
    domain_words = (w for w in normalize_prompt(prompt).split() if w not in stopwords)
    return " ".join(domain_words)
|
| 22 |
+
|
| 23 |
+
def select_relevant_steps(prompt, top_k=5):
    """Return the *top_k* COMMON_STEPS most similar to *prompt*.

    Ranks by cosine similarity of SentenceTransformer embeddings;
    results come back in descending similarity order.
    """
    prompt_embedding = model.encode(prompt, convert_to_tensor=True)
    step_embeddings = model.encode(COMMON_STEPS, convert_to_tensor=True)
    sims = util.cos_sim(prompt_embedding, step_embeddings)[0]
    top_indices = sims.argsort(descending=True)[:top_k]
    return [COMMON_STEPS[i] for i in top_indices]
|
| 29 |
+
|
| 30 |
+
def format_step_tree(title, steps):
    """Render *steps* as a flat one-level tree under a title-cased *title*.

    NOTE(review): branch glyphs were mojibake-corrupted in the source;
    restored to the standard box-drawing characters (├──, └──).
    """
    output = [title.title()]
    for i, step in enumerate(steps):
        is_last = (i == len(steps) - 1)
        output.append(f"{'└──' if is_last else '├──'} {step}")
    return "\n".join(output)
|
| 36 |
+
|
| 37 |
+
def is_definition_prompt(prompt):
    """True when the prompt asks for a definition rather than a workflow."""
    text = normalize_prompt(prompt)
    if "difference between" in text or text.startswith(("what is", "define")):
        return True
    # Expanded direct checks for SQL/PLSQL terms even in short form
    direct_terms = (
        "sql", "plsql", "pl sql", "oracle sql", "commit", "rollback", "group by", "function",
    )
    return any(term in text for term in direct_terms)
|
| 46 |
+
|
| 47 |
+
def is_schema_design_prompt(prompt):
    """True when the prompt is about schema/table/form/report design."""
    design_keywords = (
        "table", "tables", "schema", "structure", "data model", "sql tables",
        "entity", "forms", "reports", "design", "layout",
    )
    text = normalize_prompt(prompt)
    return any(keyword in text for keyword in design_keywords)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def generate_step_tree(prompt):
    """Produce a workflow-step tree for *prompt*.

    Returns "" when a tree is not the right kind of answer (definitions,
    schema-design questions, very short prompts) so the caller can fall
    through to other handlers; greetings get a friendly nudge instead.

    NOTE(review): the greeting emoji was mojibake-corrupted in the source;
    restored to the waving-hand emoji.
    """
    text = normalize_prompt(prompt)
    word_count = len(text.split())

    if word_count <= 2 and any(greet in text for greet in {"hi", "hello", "hey"}):
        return "👋 Hi there! Need help with Oracle SQL or PL/SQL?"

    # Pure "what is ..." questions are definitions, not workflows.
    if text.startswith("what is") and not any(term in text for term in {"workflow", "flow", "structure", "steps"}):
        return ""

    # The original recomputed normalize_prompt(prompt) here into a second
    # variable; the duplicate call has been removed.
    if word_count <= 2:
        return ""

    if is_definition_prompt(prompt):
        return ""

    if is_schema_design_prompt(prompt):
        return ""

    domain = extract_domain(prompt).title()
    if not domain:
        domain = "Process"
    steps = select_relevant_steps(prompt)
    return format_step_tree(f"{domain} Workflow", steps)
|