NiviruIns committed on
Commit
7e3ddbd
·
verified ·
1 Parent(s): a052544

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -16
app.py CHANGED
@@ -25,7 +25,7 @@ except Exception as e:
25
 
26
  def preprocess_diff(diff_text):
27
  """
28
- Cleans the diff to remove git metadata and save token space for the actual code.
29
  """
30
  if not diff_text:
31
  return ""
@@ -34,41 +34,53 @@ def preprocess_diff(diff_text):
34
  cleaned_lines = []
35
 
36
  for line in lines:
37
- # Remove git metadata lines
38
- if line.startswith('diff --git') or line.startswith('index ') or line.startswith('+++') or line.startswith('---'):
39
- continue
40
- # Remove chunk headers like @@ -1,4 +1,5 @@
41
- if line.startswith('@@'):
42
- continue
43
-
44
- cleaned_lines.append(line)
45
 
46
- # Join and ensure we don't send an empty string
47
  return "\n".join(cleaned_lines)
48
 
49
  def generate_summary(diff_text):
50
  # Preprocess to get pure code changes
51
  cleaned_diff = preprocess_diff(diff_text)
52
 
53
- if not cleaned_diff or len(cleaned_diff.strip()) < 5:
54
- return "Update file"
 
55
 
56
  # Tokenize
57
  input_ids = tokenizer.encode(cleaned_diff, return_tensors="pt", max_length=512, truncation=True).to(device)
58
 
59
- # Generate with better parameters to reduce "dumb" hallucinations
60
  outputs = model.generate(
61
  input_ids,
62
  max_length=80,
63
  min_length=5,
64
  num_beams=5,
65
- repetition_penalty=1.2, # Penalize repetition
66
- no_repeat_ngram_size=2, # Prevent repeating phrases
67
  early_stopping=True
68
  )
69
 
70
  summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  # Fallback if model yields empty string
73
  if not summary.strip():
74
  return "Update logic"
@@ -91,7 +103,7 @@ def generate_commit():
91
 
92
  print(f"[{name}] Length: {len(diff)}")
93
 
94
- # Increased limit to 12,000 characters to handle larger updates
95
  if len(diff) > 12000:
96
  final_message_parts.append(f"{name}\nLarge changes detected (please commit in smaller chunks)")
97
  continue
 
25
 
26
  def preprocess_diff(diff_text):
27
  """
28
+ Aggressively cleans the diff to keep ONLY the changes.
29
  """
30
  if not diff_text:
31
  return ""
 
34
  cleaned_lines = []
35
 
36
  for line in lines:
37
+ # Only keep lines that are actual additions/deletions
38
+ # checking length > 1 to avoid empty '+' or '-' lines
39
+ if (line.startswith('+') or line.startswith('-')) and len(line.strip()) > 1:
40
+ # Skip metadata lines starting with +++ or ---
41
+ if line.startswith('+++') or line.startswith('---'):
42
+ continue
43
+ cleaned_lines.append(line)
 
44
 
 
45
  return "\n".join(cleaned_lines)
46
 
47
  def generate_summary(diff_text):
48
  # Preprocess to get pure code changes
49
  cleaned_diff = preprocess_diff(diff_text)
50
 
51
+ # If cleaning removed everything (e.g., only whitespace changes), fallback
52
+ if not cleaned_diff or len(cleaned_diff.strip()) < 10:
53
+ return "Update logic"
54
 
55
  # Tokenize
56
  input_ids = tokenizer.encode(cleaned_diff, return_tensors="pt", max_length=512, truncation=True).to(device)
57
 
58
+ # Generate
59
  outputs = model.generate(
60
  input_ids,
61
  max_length=80,
62
  min_length=5,
63
  num_beams=5,
64
+ repetition_penalty=1.5, # Increased penalty to stop loops
65
+ no_repeat_ngram_size=2,
66
  early_stopping=True
67
  )
68
 
69
  summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
70
 
71
+ # --- HALLUCINATION GUARD ---
72
+ # Check for random Jira tickets (e.g., STORM-236, PROJ-123)
73
+ # Pattern: Uppercase letters, hyphen, numbers
74
+ ticket_pattern = re.compile(r'\b[A-Z]{2,}-\d+\b')
75
+ match = ticket_pattern.search(summary)
76
+
77
+ if match:
78
+ found_ticket = match.group()
79
+ # If the ticket ID is NOT in the source code, it's a hallucination
80
+ if found_ticket not in diff_text:
81
+ print(f"⚠️ Detected hallucination ({found_ticket}). Reverting to fallback.")
82
+ return "Refactor code and logic"
83
+
84
  # Fallback if model yields empty string
85
  if not summary.strip():
86
  return "Update logic"
 
103
 
104
  print(f"[{name}] Length: {len(diff)}")
105
 
106
+ # Guard against massive files
107
  if len(diff) > 12000:
108
  final_message_parts.append(f"{name}\nLarge changes detected (please commit in smaller chunks)")
109
  continue