GranularFireplace committed on
Commit
8c62a0b
·
verified ·
1 Parent(s): 305adbb

Fix YARA preprocessing overwriting rule files that share a filename

Browse files
Files changed (1) hide show
  1. app.py +41 -17
app.py CHANGED
@@ -74,36 +74,60 @@ def preprocess_yara_rules(repo_path: Path) -> Path:
74
  shutil.rmtree(processed_dir)
75
  processed_dir.mkdir()
76
 
77
- rule_pattern = re.compile(r"rule\s+(\w+)") # Match YARA rule names
78
- seen_rules = {} # Track rule names to prevent duplicates
79
- rule_counter = 0 # Counter for renaming duplicate rules
80
 
81
  for yara_file in repo_path.glob("**/*.yara"):
82
  if yara_file.name == "misc.yara":
83
  logger.info(f"Skipping {yara_file} as it does not belong to any malware family")
84
  continue
 
 
 
 
 
 
 
 
 
 
85
  new_content = []
86
- with open(yara_file, "r", encoding="utf-8") as f:
87
  for line in f:
88
- match = rule_pattern.match(line)
 
 
89
  if match:
90
  original_name = match.group(1)
91
- if original_name in seen_rules:
92
- logger.info(f"{original_name} is unseen")
93
- rule_counter += 1
94
- seen_rules[original_name] += 1
95
- new_name = f"{original_name}_{seen_rules[original_name]}"
 
 
 
 
 
 
 
 
96
  line = line.replace(original_name, new_name, 1)
 
 
97
  else:
98
- logger.warn(f"{original_name} is SEEN")
99
- seen_rules[original_name] = 0 # First occurrence
100
- new_content.append(line)
101
-
102
- # Save the modified file
103
- processed_file = processed_dir / yara_file.name
 
104
  with open(processed_file, "w", encoding="utf-8") as f:
105
  f.writelines(new_content)
106
- logger.info(f"Total of {rule_counter} rule(s) are duplicates. They have been renamed.")
 
107
  return processed_dir
108
 
109
  def compile_yara_rules(repo_path: Path) -> Optional[yara.Rules]:
 
74
  shutil.rmtree(processed_dir)
75
  processed_dir.mkdir()
76
 
77
+ rule_pattern = re.compile(r"rule\s+([^\s{]+)")
78
+ seen_rules = {}
79
+ rule_counter = 0
80
 
81
  for yara_file in repo_path.glob("**/*.yara"):
82
  if yara_file.name == "misc.yara":
83
  logger.info(f"Skipping {yara_file} as it does not belong to any malware family")
84
  continue
85
+
86
+ # Preserve directory structure
87
+ try:
88
+ relative_path = yara_file.relative_to(repo_path)
89
+ except ValueError:
90
+ continue # Skip files not in repo path
91
+
92
+ processed_file = processed_dir / relative_path
93
+ processed_file.parent.mkdir(parents=True, exist_ok=True)
94
+
95
  new_content = []
96
+ with open(yara_file, "r", encoding="utf-8", errors='replace') as f:
97
  for line in f:
98
+ line = line.rstrip('\n')
99
+ match = rule_pattern.match(line.strip())
100
+
101
  if match:
102
  original_name = match.group(1)
103
+ # Sanitize rule name
104
+ clean_name = re.sub(r'[^a-zA-Z0-9_]', '_', original_name)
105
+
106
+ # Ensure valid starting character
107
+ if not clean_name:
108
+ clean_name = "invalid_rule"
109
+ elif not clean_name[0].isalpha() and clean_name[0] != '_':
110
+ clean_name = f"_{clean_name}"
111
+
112
+ # Handle duplicates
113
+ if clean_name in seen_rules:
114
+ seen_rules[clean_name] += 1
115
+ new_name = f"{clean_name}_{seen_rules[clean_name]}"
116
  line = line.replace(original_name, new_name, 1)
117
+ rule_counter += 1
118
+ logger.debug(f"Renamed duplicate rule: {original_name} -> {new_name}")
119
  else:
120
+ seen_rules[clean_name] = 0 # Initialize count
121
+ if clean_name != original_name:
122
+ line = line.replace(original_name, clean_name, 1)
123
+ logger.debug(f"Sanitized rule name: {original_name} -> {clean_name}")
124
+
125
+ new_content.append(line + '\n')
126
+
127
  with open(processed_file, "w", encoding="utf-8") as f:
128
  f.writelines(new_content)
129
+
130
+ logger.info(f"Processed {rule_counter} duplicate rules")
131
  return processed_dir
132
 
133
  def compile_yara_rules(repo_path: Path) -> Optional[yara.Rules]: