Spaces:

NiviruIns
/

ai-commit-server

Sleeping

App Files Files Community

NiviruIns committed on Feb 4

Commit

cda6349

verified ·

1 Parent(s): 0857640

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -46

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ import torch
 app = Flask(__name__)
-# --- MODEL SETUP ---
 MODEL_NAME = "SEBIS/code_trans_t5_base_commit_generation"
 print(f"--- AI Commit Generator Server ---")
 print(f"Downloading/Loading Model: {MODEL_NAME}")
@@ -23,7 +23,7 @@ except Exception as e:
 def preprocess_diff(diff_text):
     """
-    Cleans the diff to remove metadata and save token space.
     """
     if not diff_text:
         return ""
@@ -32,75 +32,76 @@ def preprocess_diff(diff_text):
     cleaned_lines = []
     for line in lines:
-        # We only care about changes
         if (line.startswith('+') or line.startswith('-')):
-            # Skip metadata +++ / ---
-            if line.startswith('+++') or line.startswith('---'):
-                continue
-            # Clean generic import lines which confuse the model
-            if "import " in line or "require(" in line:
-                continue
             cleaned_lines.append(line.strip())
     return "\n".join(cleaned_lines)
-def is_hallucination(summary, diff_text):
     """
-    Returns True if the summary contains known hallucination patterns.
     """
-    summary_lower = summary.lower()
-    # 1. Linguistic nonsense
-    forbidden_terms = [
-        "transitive verb", "intransitive verb", "adjective",
-        "noun", "pronoun", "metrics collection", "data volume"
-    ]
-    if any(term in summary_lower for term in forbidden_terms):
-        return True
-    # 2. Random Jira Tickets (e.g. STORM-123) that are NOT in the diff
-    ticket_pattern = re.compile(r'\b[A-Z]{2,}-\d+\b')
-    match = ticket_pattern.search(summary)
     if match:
         ticket = match.group()
         if ticket not in diff_text:
-            return True
-    return False
 def generate_summary(diff_text, filename):
-    # Preprocess
     cleaned_diff = preprocess_diff(diff_text)
-    # If the diff is just imports or too small, don't ask the AI
-    if not cleaned_diff or len(cleaned_diff) < 15:
         return f"Update {filename}"
-    # Tokenize
     input_ids = tokenizer.encode(cleaned_diff, return_tensors="pt", max_length=512, truncation=True).to(device)
     # Generate
     outputs = model.generate(
         input_ids,
-        max_length=50,         # Shorter max length to prevent rambling
-        min_length=3,
         num_beams=5,
         early_stopping=True
     )
-    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Validate Output
-    if is_hallucination(summary, diff_text):
-        print(f"⚠️ Hallucination caught: '{summary}' -> Reverting to default.")
-        return f"Update {filename} logic"
-    if not summary.strip():
-        return f"Modify {filename}"
-    return summary
 @app.route('/generate', methods=['POST'])
 def generate_commit():
@@ -116,9 +117,9 @@ def generate_commit():
         name = file_obj.get('name', 'file')
         diff = file_obj.get('diff', '')
-        # Guard for massive files
         if len(diff) > 12000:
-            final_message_parts.append(f"{name}\nLarge update (chunked)")
             continue
         try:
@@ -126,7 +127,7 @@ def generate_commit():
             final_message_parts.append(f"{name}\n{summary}")
         except Exception as e:
             print(f"Error processing {name}: {e}")
-            final_message_parts.append(f"{name}\nUpdate changes")
     return jsonify({"commit_message": "\n\n".join(final_message_parts)})

 app = Flask(__name__)
+# --- MODEL LOADING ---
 MODEL_NAME = "SEBIS/code_trans_t5_base_commit_generation"
 print(f"--- AI Commit Generator Server ---")
 print(f"Downloading/Loading Model: {MODEL_NAME}")
 def preprocess_diff(diff_text):
     """
+    Strips all metadata to ensure the model focuses ONLY on code changes.
     """
     if not diff_text:
         return ""
     cleaned_lines = []
     for line in lines:
+        # Keep only added (+) or removed (-) lines
         if (line.startswith('+') or line.startswith('-')):
+            # Remove metadata markers and noisy imports
+            if line.startswith('+++') or line.startswith('---'): continue
+            if "import " in line or "require(" in line: continue
+            if len(line.strip()) < 5: continue # Skip braces/empty lines
             cleaned_lines.append(line.strip())
     return "\n".join(cleaned_lines)
+def sanitize_summary(summary, diff_text, filename):
     """
+    The 'Scorched Earth' filter. If it smells like a hallucination, kill it.
     """
+    summary_clean = summary.strip()
+    # 1. Catch Jira Tickets (e.g., STORM-1404, JIRA - 123)
+    # The regex allows for optional spaces around the hyphen
+    ticket_pattern = re.compile(r'\b[A-Z]{3,}\s?-\s?\d+\b')
+    match = ticket_pattern.search(summary_clean)
     if match:
         ticket = match.group()
+        # If this exact ticket string isn't in the source code, it's fake.
         if ticket not in diff_text:
+            print(f"⚠️ Hallucination Killed: '{ticket}' in '{filename}'")
+            return f"Update {filename}"
+    # 2. Catch Linguistic Nonsense
+    forbidden_words = [
+        "transitive verb", "intransitive", "adjective",
+        "CHANGELOG", "readme", "documentation"
+    ]
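+    # Skip the whole forbidden-word check when the file itself is a changelog
+    # (NOTE: the "CHANGELOG" entry is uppercase but is matched against a lowercased summary, so it never fires)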
+    if "changelog" not in filename.lower():
+        for word in forbidden_words:
+            if word in summary_clean.lower():
+                print(f"⚠️ Nonsense Killed: '{word}' in '{filename}'")
+                return f"Update {filename} logic"
+    return summary_clean
 def generate_summary(diff_text, filename):
+    # Aggressively clean the input
     cleaned_diff = preprocess_diff(diff_text)
+    # If the diff is too small (e.g., just whitespace), skip the AI
+    if not cleaned_diff or len(cleaned_diff) < 20:
         return f"Update {filename}"
+    # Encode
     input_ids = tokenizer.encode(cleaned_diff, return_tensors="pt", max_length=512, truncation=True).to(device)
     # Generate
     outputs = model.generate(
         input_ids,
+        max_length=60,
+        min_length=5,
         num_beams=5,
         early_stopping=True
     )
+    raw_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Apply the Sanitizer
+    final_summary = sanitize_summary(raw_summary, diff_text, filename)
+    return final_summary
 @app.route('/generate', methods=['POST'])
 def generate_commit():
         name = file_obj.get('name', 'file')
         diff = file_obj.get('diff', '')
+        # Hard limit on huge files
         if len(diff) > 12000:
+            final_message_parts.append(f"{name}\nUpdate large file (chunked)")
             continue
         try:
             final_message_parts.append(f"{name}\n{summary}")
         except Exception as e:
             print(f"Error processing {name}: {e}")
+            final_message_parts.append(f"{name}\nRefactor code")
     return jsonify({"commit_message": "\n\n".join(final_message_parts)})