Spaces:

NiviruIns
/

ai-commit-server

Sleeping

App Files Files Community

NiviruIns committed on Feb 4

Commit

7e3ddbd

verified ·

1 Parent(s): a052544

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -16

app.py CHANGED Viewed

@@ -25,7 +25,7 @@ except Exception as e:
 def preprocess_diff(diff_text):
     """
-    Cleans the diff to remove git metadata and save token space for the actual code.
     """
     if not diff_text:
         return ""
@@ -34,41 +34,53 @@ def preprocess_diff(diff_text):
     cleaned_lines = []
     for line in lines:
-        # Remove git metadata lines
-        if line.startswith('diff --git') or line.startswith('index ') or line.startswith('+++') or line.startswith('---'):
-            continue
-        # Remove chunk headers like @@ -1,4 +1,5 @@
-        if line.startswith('@@'):
-            continue
-        cleaned_lines.append(line)
-    # Join and ensure we don't send an empty string
     return "\n".join(cleaned_lines)
 def generate_summary(diff_text):
     # Preprocess to get pure code changes
     cleaned_diff = preprocess_diff(diff_text)
-    if not cleaned_diff or len(cleaned_diff.strip()) < 5:
-        return "Update file"
     # Tokenize
     input_ids = tokenizer.encode(cleaned_diff, return_tensors="pt", max_length=512, truncation=True).to(device)
-    # Generate with better parameters to reduce "dumb" hallucinations
     outputs = model.generate(
         input_ids,
         max_length=80,
         min_length=5,
         num_beams=5,
-        repetition_penalty=1.2,    # Penalize repetition
-        no_repeat_ngram_size=2,    # Prevent repeating phrases
         early_stopping=True
     )
     summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
     # Fallback if model yields empty string
     if not summary.strip():
         return "Update logic"
@@ -91,7 +103,7 @@ def generate_commit():
         print(f"[{name}] Length: {len(diff)}")
-        # Increased limit to 12,000 characters to handle larger updates
         if len(diff) > 12000:
             final_message_parts.append(f"{name}\nLarge changes detected (please commit in smaller chunks)")
             continue

 def preprocess_diff(diff_text):
     """
+    Aggressively cleans the diff to keep ONLY the changes.
     """
     if not diff_text:
         return ""
     cleaned_lines = []
     for line in lines:
+        # Only keep lines that are actual additions/deletions
+        # checking length > 1 to avoid empty '+' or '-' lines
+        if (line.startswith('+') or line.startswith('-')) and len(line.strip()) > 1:
+            # Skip metadata lines starting with +++ or ---
+            if line.startswith('+++') or line.startswith('---'):
+                continue
+            cleaned_lines.append(line)
     return "\n".join(cleaned_lines)
 def generate_summary(diff_text):
     # Preprocess to get pure code changes
     cleaned_diff = preprocess_diff(diff_text)
+    # If cleaning removed everything (e.g., only whitespace changes), fallback
+    if not cleaned_diff or len(cleaned_diff.strip()) < 10:
+        return "Update logic"
     # Tokenize
     input_ids = tokenizer.encode(cleaned_diff, return_tensors="pt", max_length=512, truncation=True).to(device)
+    # Generate
     outputs = model.generate(
         input_ids,
         max_length=80,
         min_length=5,
         num_beams=5,
+        repetition_penalty=1.5,     # Increased penalty to stop loops
+        no_repeat_ngram_size=2,
         early_stopping=True
     )
     summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # --- HALLUCINATION GUARD ---
+    # Check for random Jira tickets (e.g., STORM-236, PROJ-123)
+    # Pattern: Uppercase letters, hyphen, numbers
+    ticket_pattern = re.compile(r'\b[A-Z]{2,}-\d+\b')
+    match = ticket_pattern.search(summary)
+    if match:
+        found_ticket = match.group()
+        # If the ticket ID is NOT in the source code, it's a hallucination
+        if found_ticket not in diff_text:
+            print(f"⚠️ Detected hallucination ({found_ticket}). Reverting to fallback.")
+            return "Refactor code and logic"
     # Fallback if model yields empty string
     if not summary.strip():
         return "Update logic"
         print(f"[{name}] Length: {len(diff)}")
+        # Guard against massive files
         if len(diff) > 12000:
             final_message_parts.append(f"{name}\nLarge changes detected (please commit in smaller chunks)")
             continue