Spaces:

NiviruIns
/

ai-commit-server

Sleeping

App Files Files Community

NiviruIns committed on Feb 4

Commit

0857640

verified ·

1 Parent(s): 7e3ddbd

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -40

app.py CHANGED Viewed

@@ -6,16 +6,14 @@ import torch
 app = Flask(__name__)
-# --- SWITCH TO THE EXPERT MODEL ---
 MODEL_NAME = "SEBIS/code_trans_t5_base_commit_generation"
 print(f"--- AI Commit Generator Server ---")
 print(f"Downloading/Loading Model: {MODEL_NAME}")
 device = "cpu"
 try:
-    # AutoTokenizer handles the specific needs of this model automatically
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, skip_special_tokens=True)
     model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
     print("✅ Model loaded successfully!")
@@ -25,7 +23,7 @@ except Exception as e:
 def preprocess_diff(diff_text):
     """
-    Aggressively cleans the diff to keep ONLY the changes.
     """
     if not diff_text:
         return ""
@@ -34,23 +32,51 @@ def preprocess_diff(diff_text):
     cleaned_lines = []
     for line in lines:
-        # Only keep lines that are actual additions/deletions
-        # checking length > 1 to avoid empty '+' or '-' lines
-        if (line.startswith('+') or line.startswith('-')) and len(line.strip()) > 1:
-            # Skip metadata lines starting with +++ or ---
             if line.startswith('+++') or line.startswith('---'):
                 continue
-            cleaned_lines.append(line)
     return "\n".join(cleaned_lines)
-def generate_summary(diff_text):
-    # Preprocess to get pure code changes
     cleaned_diff = preprocess_diff(diff_text)
-    # If cleaning removed everything (e.g., only whitespace changes), fallback
-    if not cleaned_diff or len(cleaned_diff.strip()) < 10:
-        return "Update logic"
     # Tokenize
     input_ids = tokenizer.encode(cleaned_diff, return_tensors="pt", max_length=512, truncation=True).to(device)
@@ -58,32 +84,21 @@ def generate_summary(diff_text):
     # Generate
     outputs = model.generate(
         input_ids,
-        max_length=80,
-        min_length=5,
         num_beams=5,
-        repetition_penalty=1.5,     # Increased penalty to stop loops
-        no_repeat_ngram_size=2,
         early_stopping=True
     )
     summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # --- HALLUCINATION GUARD ---
-    # Check for random Jira tickets (e.g., STORM-236, PROJ-123)
-    # Pattern: Uppercase letters, hyphen, numbers
-    ticket_pattern = re.compile(r'\b[A-Z]{2,}-\d+\b')
-    match = ticket_pattern.search(summary)
-    if match:
-        found_ticket = match.group()
-        # If the ticket ID is NOT in the source code, it's a hallucination
-        if found_ticket not in diff_text:
-            print(f"⚠️ Detected hallucination ({found_ticket}). Reverting to fallback.")
-            return "Refactor code and logic"
-    # Fallback if model yields empty string
     if not summary.strip():
-        return "Update logic"
     return summary
@@ -98,22 +113,20 @@ def generate_commit():
     final_message_parts = []
     for file_obj in files:
-        name = file_obj.get('name', 'Unknown File')
         diff = file_obj.get('diff', '')
-        print(f"[{name}] Length: {len(diff)}")
-        # Guard against massive files
         if len(diff) > 12000:
-            final_message_parts.append(f"{name}\nLarge changes detected (please commit in smaller chunks)")
             continue
         try:
-            summary = generate_summary(diff)
             final_message_parts.append(f"{name}\n{summary}")
         except Exception as e:
             print(f"Error processing {name}: {e}")
-            final_message_parts.append(f"{name}\nUpdate file")
     return jsonify({"commit_message": "\n\n".join(final_message_parts)})

 app = Flask(__name__)
+# --- MODEL SETUP ---
 MODEL_NAME = "SEBIS/code_trans_t5_base_commit_generation"
 print(f"--- AI Commit Generator Server ---")
 print(f"Downloading/Loading Model: {MODEL_NAME}")
 device = "cpu"
 try:
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, skip_special_tokens=True)
     model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
     print("✅ Model loaded successfully!")
 def preprocess_diff(diff_text):
     """
+    Cleans the diff to remove metadata and save token space.
     """
     if not diff_text:
         return ""
     cleaned_lines = []
     for line in lines:
+        # We only care about changes
+        if (line.startswith('+') or line.startswith('-')):
+            # Skip metadata +++ / ---
             if line.startswith('+++') or line.startswith('---'):
                 continue
+            # Clean generic import lines which confuse the model
+            if "import " in line or "require(" in line:
+                continue
+            cleaned_lines.append(line.strip())
     return "\n".join(cleaned_lines)
# Phrases that mark a generated summary as model noise (grammar jargon and
# off-topic boilerplate).
_FORBIDDEN_TERMS = (
    "transitive verb", "intransitive verb", "adjective",
    "noun", "pronoun", "metrics collection", "data volume",
)

# The terms are matched as whole words (optionally pluralised with "s") rather
# than as raw substrings, so ordinary words that merely contain one of them --
# e.g. "announcement" or "pronounce", which both contain "noun" -- are not
# rejected.
_FORBIDDEN_TERMS_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(term) for term in _FORBIDDEN_TERMS) + r")s?\b",
    re.IGNORECASE,
)

# Jira-style ticket IDs: two or more capitals, a hyphen, digits (e.g. STORM-123).
_TICKET_RE = re.compile(r'\b[A-Z]{2,}-\d+\b')


def is_hallucination(summary, diff_text):
    """
    Returns True if the summary contains known hallucination patterns.

    Two checks are applied, in order:

    1. The summary mentions one of the forbidden phrases (case-insensitive,
       whole-word match).
    2. The summary mentions a Jira-style ticket ID that does not occur
       anywhere in ``diff_text``. Every ticket ID in the summary is checked,
       not just the first, so one genuine ID cannot mask an invented one.

    Args:
        summary: The generated commit summary. Empty or None yields False.
        diff_text: The diff the summary was generated from. None is treated
            as an empty string.

    Returns:
        True if either check fires, otherwise False.
    """
    if not summary:
        return False

    # 1. Linguistic nonsense
    if _FORBIDDEN_TERMS_RE.search(summary):
        return True

    # 2. Ticket IDs that are NOT present in the diff
    source = diff_text or ""
    return any(ticket not in source for ticket in _TICKET_RE.findall(summary))
def generate_summary(diff_text, filename):
    """
    Builds a commit summary for one file's diff.

    The diff is cleaned with ``preprocess_diff`` and, if enough text remains,
    encoded (truncated to 512 tokens) and passed to the model's ``generate``
    with 5 beams. The decoded text is returned unless ``is_hallucination``
    rejects it or it is blank, in which case a filename-based default is
    returned instead.

    Args:
        diff_text: Raw diff text for the file.
        filename: Name used in the fallback messages.

    Returns:
        The generated summary, or one of the fallback strings.
    """
    payload = preprocess_diff(diff_text)

    # Too little left after cleaning: skip the model entirely.
    if not (payload and len(payload) >= 15):
        return f"Update {filename}"

    # Tokenize
    encoded = tokenizer.encode(
        payload,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    encoded = encoded.to(device)

    # Generate (short max length to prevent rambling)
    decoding_options = {
        "max_length": 50,
        "min_length": 3,
        "num_beams": 5,
        "early_stopping": True,
    }
    generated = model.generate(encoded, **decoding_options)
    message = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Validate output; the check runs against the raw diff, not the cleaned one.
    if is_hallucination(message, diff_text):
        print(f"⚠️ Hallucination caught: '{message}' -> Reverting to default.")
        return f"Update {filename} logic"

    return message if message.strip() else f"Modify {filename}"
     final_message_parts = []
     for file_obj in files:
+        name = file_obj.get('name', 'file')
         diff = file_obj.get('diff', '')
+        # Guard for massive files
         if len(diff) > 12000:
+            final_message_parts.append(f"{name}\nLarge update (chunked)")
             continue
         try:
+            summary = generate_summary(diff, name)
             final_message_parts.append(f"{name}\n{summary}")
         except Exception as e:
             print(f"Error processing {name}: {e}")
+            final_message_parts.append(f"{name}\nUpdate changes")
     return jsonify({"commit_message": "\n\n".join(final_message_parts)})