nihalaninihal committed on
Commit
dd3aad7
·
verified ·
1 Parent(s): f8d227c

Delete analyze_temportal_patterns.py

Browse files
Files changed (1) hide show
  1. analyze_temportal_patterns.py +0 -466
analyze_temportal_patterns.py DELETED
@@ -1,466 +0,0 @@
1
- from collections import Counter
2
- import json
3
- from datetime import datetime
4
- import statistics
5
- from typing import Any, Dict, List, Tuple
6
- from analyze_repository_structure import RELEVANT_EXTENSIONS
7
- from pathlib import Path
8
- from prompt_analyzer import create_handler
9
- import subprocess
10
- import os
11
-
12
-
13
def analyze_temporal_patterns(
    sources_data: Dict[str, Any], report_data: Dict[str, Any]
) -> Dict[str, Any]:
    """Analyzes temporal patterns using both LLM and statistical analysis.

    For each repository with sufficient history (see _select_best_targets),
    sends the collected code-evolution data to an LLM for a structured
    temporal analysis; independently computes statistical commit-timing
    patterns from all commit timestamps. Also dumps the data sent to the
    LLM to out/temporal_analysis_contents.json for manual inspection.

    Args:
        sources_data: Per-repository analysis data (structure, samples,
            file_stats) as produced upstream.
        report_data: Report payload; ``report_data["commits"]`` maps repo
            name -> list of commit objects with
            ``commit["commit"]["author"]["date"]`` in ISO format ending
            in "Z" — presumably GitHub API commit objects; verify against
            the caller.

    Returns:
        Dict with "commit_style_metrics" (per-repo LLM results, or
        ``{"error": ...}`` on failure) and "activity_patterns"
        (statistical timing summary).
    """

    commits = report_data.get("commits", {})

    # Setup LLM Prompting
    handler = create_handler()
    combined_results = {}

    # Get commit timestamps for activity analysis.
    # "Z" is replaced with "+00:00" because datetime.fromisoformat does not
    # accept the "Z" suffix on older Python versions.
    commit_times = [
        datetime.fromisoformat(
            commit["commit"]["author"]["date"].replace("Z", "+00:00")
        )
        for repo_commits in commits.values()
        for commit in repo_commits
    ]

    # Get best targets and their commit contents
    temporal_best_targets = _select_best_targets(sources_data, commits)
    commit_contents = _get_commit_contents(temporal_best_targets, sources_data)

    # Save commit contents for inspection
    inspection_data = {
        "temporal_targets": temporal_best_targets,
        "commit_contents": commit_contents,
    }

    # NOTE(review): assumes the "out" directory already exists; a missing
    # directory is swallowed by the except below (best-effort save).
    inspection_path = Path("out") / "temporal_analysis_contents.json"
    try:
        with open(inspection_path, "w", encoding="utf-8") as f:
            json.dump(inspection_data, f, indent=2)
        print(f"Saved temporal analysis data to {inspection_path}")
    except Exception as e:
        print(f"Error saving inspection data: {str(e)}")

    # NOTE(review): repo_data is never used inside this loop; iteration only
    # needs the repo names that survived target selection.
    for repo_name, repo_data in sources_data.items():
        if repo_name not in temporal_best_targets:
            continue

        print(f"\nAnalyzing temporal patterns for repository: {repo_name}")

        # Get code changes for this repository
        repo_changes = commit_contents.get(repo_name, [])
        if not repo_changes:
            continue

        # Analyze code style evolution using LLM with actual code changes.
        # Doubled braces ({{ }}) are literal braces in the f-string output.
        prompt = f"""

        TEMPORAL ANALYSIS

        Analyze the temporal evolution of this codebase with focus on developer behavior patterns and code evolution.

        Repository: {repo_name}

        Code Evolution Data:
        {json.dumps(repo_changes, indent=2)}

        Generate detailed temporal analysis JSON:
        {{
            "evolution_patterns": {{
                "code_quality": {{
                    "progression": string,
                    "refactoring_patterns": [
                        {{
                            "pattern": string,
                            "frequency": string,
                            "motivation": string
                        }}
                    ],
                    "complexity_trends": {{
                        "direction": string,
                        "significant_changes": [string],
                        "trigger_patterns": [string]
                    }}
                }},
                "development_cycles": {{
                    "commit_patterns": {{
                        "frequency": {{
                            "pattern": string,
                            "active_hours": [string],
                            "timezone_confidence": {{
                                "zone": string,
                                "confidence": number,
                                "evidence": [string]
                            }}
                        }},
                        "burst_patterns": [
                            {{
                                "pattern": string,
                                "typical_duration": string,
                                "characteristics": [string]
                            }}
                        ]
                    }},
                    "feature_development": {{
                        "typical_cycle": string,
                        "iteration_patterns": [string],
                        "testing_integration": string
                    }}
                }},
                "communication_patterns": {{
                    "pr_characteristics": {{
                        "detail_level": string,
                        "discussion_style": string,
                        "iteration_patterns": string
                    }},
                    "documentation_evolution": {{
                        "frequency": string,
                        "detail_trends": string,
                        "update_patterns": string
                    }}
                }}
            }},
            "architectural_evolution": {{
                "major_changes": [
                    {{
                        "change": string,
                        "motivation": string,
                        "impact": string
                    }}
                ],
                "improvement_patterns": {{
                    "refactoring_types": [string],
                    "optimization_focus": [string],
                    "maintenance_patterns": string
                }},
                "technical_debt": {{
                    "accumulation_patterns": [string],
                    "resolution_approaches": string,
                    "prevention_strategies": string
                }}
            }}
        }}

        Requirements:
        1. Focus on developer behavior patterns
        2. Track evolution of coding style
        3. Identify clear timezone patterns
        4. Detail burst activity characteristics
        5. Analyze code quality progression
        """

        try:
            # NOTE(review): a falsy (e.g. empty) LLM result silently omits
            # the repo from combined_results rather than recording an error.
            result = handler.generate_json_response(prompt)
            if result:
                combined_results[repo_name] = result
        except Exception as e:
            print(f"Error analyze_temporal_patterns {repo_name}: {str(e)}")
            combined_results[repo_name] = {"error": str(e)}

    return {
        "commit_style_metrics": combined_results,
        "activity_patterns": _analyze_activity_patterns(commit_times),
    }
172
-
173
-
174
- def _clean_diff(diff_output: str) -> str:
175
- """Clean up diff output to focus on actual changes"""
176
- lines = diff_output.split("\n")
177
- cleaned_lines = []
178
- skip_next = False
179
-
180
- for line in lines:
181
- # Skip git-specific headers
182
- if (
183
- line.startswith("diff --git")
184
- or line.startswith("index ")
185
- or line.startswith("new file mode ")
186
- or line.startswith("deleted file mode ")
187
- ):
188
- continue
189
-
190
- # Keep file markers but clean them up
191
- if line.startswith("--- ") or line.startswith("+++ "):
192
- # Convert /dev/null to clearer marker
193
- if "/dev/null" in line:
194
- continue
195
- # Keep just the filename
196
- cleaned_lines.append(line.split("/")[-1])
197
- continue
198
-
199
- # Keep actual diff content
200
- if (
201
- line.startswith("@@ ")
202
- or line.startswith("+")
203
- or line.startswith("-")
204
- or line.startswith(" ")
205
- ):
206
- cleaned_lines.append(line)
207
-
208
- return "\n".join(cleaned_lines)
209
-
210
def _get_commit_contents(
    target_repos: List[str], sources_data: Dict[str, Any], max_diff_lines: int = 100
) -> Dict[str, Dict[str, Any]]:
    """
    Retrieves commit contents focusing on core files and limiting diff sizes.
    Now with cleaner diff output.

    For each target repository, walks the git history of each "core file"
    and samples up to three key commits per file (first, middle when the
    history has more than 4 commits, last), collecting the cleaned diff of
    each; the full file content is attached for the first and last sampled
    commit only. Errors from git are handled per-file/per-commit so one
    bad file does not abort the repository.

    Args:
        target_repos: Repository names (keys of ``sources_data``) to process.
        sources_data: Per-repo analysis data; this function reads
            ``structure.name`` (expected format ``username_reponame.git``)
            and ``samples.core_files``.
        max_diff_lines: Commits whose raw diff exceeds this many lines are
            skipped to keep payloads small.

    Returns:
        Mapping of repo name to ``{"core_files": [...], "evolution":
        {"commit_count": int, "commits_by_file": {file: [commit, ...]}}}``.
        Repos with no usable commits are omitted.

    Raises:
        ValueError: If no username can be derived from any repository's
            structure data.
    """
    commit_contents = {}

    # Extract username from the first repository's path structure
    username = None
    for repo in sources_data.values():
        if repo.get('structure', {}).get('name', ''):
            # Extract username from the repository name (format: username_reponame.git)
            username = repo['structure']['name'].split('_')[0]
            break

    if not username:
        raise ValueError("Could not determine username from repository structure")

    for repo_name in target_repos:
        # Store the full repo path but don't overwrite repo_name
        repo_path_name = sources_data[repo_name]['structure'].get('name', '')

        if not repo_path_name:
            print(f"Warning: No path found for repository {repo_name}")
            continue

        # Construct correct path using extracted username.
        # NOTE(review): assumes clones live under out/<username>/ relative to
        # the current working directory — confirm against the cloning step.
        repo_path = f"out/{username}/{repo_path_name}"

        # Get core files from sources_data using original repo_name
        core_files = sources_data[repo_name].get("samples", {}).get("core_files", {})
        if not core_files:
            continue

        try:
            commits = []
            for file_path, _ in core_files.items():
                try:
                    # Get commit history for this file (oldest first, one
                    # "<sha> <iso-date>" line per commit).
                    commit_history = subprocess.check_output(
                        [
                            "git",
                            "log",
                            "--format=%H %ad",
                            "--date=iso",
                            "--reverse",
                            "--",
                            file_path,
                        ],
                        cwd=repo_path,
                        text=True,
                    ).splitlines()

                    # Process key commits
                    commits_to_process = []
                    if len(commit_history) > 0:
                        commits_to_process.append(commit_history[0])  # First commit
                    if len(commit_history) > 4:
                        # Add some middle commits, evenly spaced
                        middle_idx = len(commit_history) // 2
                        commits_to_process.append(commit_history[middle_idx])
                    if len(commit_history) > 1:
                        commits_to_process.append(commit_history[-1])  # Last commit

                    # prev_content doubles as a "first commit already seen"
                    # flag for the content-fetch branching below.
                    prev_content = None
                    for commit_info in commits_to_process:
                        sha, date = commit_info.split(" ", 1)
                        try:
                            # Get the diff for this commit only
                            # (--format= suppresses the commit header).
                            diff_output = subprocess.check_output(
                                ["git", "show", "--format=", sha, "--", file_path],
                                cwd=repo_path,
                                text=True,
                                stderr=subprocess.PIPE,
                            )

                            # Skip if diff is too large
                            diff_lines = diff_output.splitlines()
                            if len(diff_lines) > max_diff_lines:
                                continue

                            # Clean up the diff
                            clean_diff = _clean_diff(diff_output)
                            if not clean_diff.strip():
                                continue

                            # Get actual file content at this commit for first and last commit only
                            if prev_content is None:  # First commit
                                file_content = subprocess.check_output(
                                    ["git", "show", f"{sha}:{file_path}"],
                                    cwd=repo_path,
                                    text=True,
                                    stderr=subprocess.PIPE,
                                )
                                prev_content = file_content
                            elif commit_info == commits_to_process[-1]:  # Last commit
                                file_content = subprocess.check_output(
                                    ["git", "show", f"{sha}:{file_path}"],
                                    cwd=repo_path,
                                    text=True,
                                    stderr=subprocess.PIPE,
                                )
                            else:
                                file_content = None

                            commit_data = {
                                "sha": sha,
                                "date": date,
                                "file": file_path,
                                "changes": clean_diff,
                            }

                            if file_content:
                                commit_data["content"] = file_content

                            commits.append(commit_data)

                        except subprocess.CalledProcessError:
                            # git show failed for this sha/file: skip the commit.
                            continue

                except subprocess.CalledProcessError:
                    # git log failed for this file (e.g. untracked): skip it.
                    continue

            if commits:
                # Sort commits by date
                commits.sort(key=lambda x: x["date"])

                # Group commits by file for better analysis
                files_commits = {}
                for commit in commits:
                    file_path = commit["file"]
                    if file_path not in files_commits:
                        files_commits[file_path] = []
                    files_commits[file_path].append(commit)

                commit_contents[repo_name] = {
                    "core_files": list(core_files.keys()),
                    "evolution": {
                        "commit_count": len(commits),
                        "commits_by_file": files_commits,
                    },
                }

                print(f"Processed {len(commits)} commits for {repo_name} core files")

        except Exception as e:
            # Best-effort per repository: log and move on.
            print(f"Error analyzing repository {repo_name}: {str(e)}")
            continue

    return commit_contents
362
-
363
- def _select_best_targets(
364
- sources_data: Dict[str, Any], commits: Dict[str, Any]
365
- ) -> List[str]:
366
- """Selects repositories with sufficient history for analysis"""
367
- targets = []
368
-
369
- for repo_name, repo_data in sources_data.items():
370
- if (
371
- len(commits.get(repo_name, [])) < 5
372
- or repo_data["file_stats"]["file_count"] < 10
373
- ):
374
- continue
375
- targets.append(repo_name)
376
-
377
- return targets
378
-
379
-
380
- def _analyze_activity_patterns(commit_times: List[datetime]) -> Dict[str, Any]:
381
- """Analyzes commit timing patterns"""
382
- if not commit_times:
383
- return {
384
- "frequency": {
385
- "commits_per_day": 0,
386
- "active_hours": [],
387
- "timezone_hint": "unknown",
388
- },
389
- "burst_patterns": {
390
- "intensity": "low",
391
- "average_duration": "n/a",
392
- "frequency": "sporadic",
393
- },
394
- }
395
-
396
- # Sort commit times
397
- commit_times.sort()
398
-
399
- # Calculate commits per day
400
- days_span = (commit_times[-1] - commit_times[0]).days or 1
401
- commits_per_day = round(len(commit_times) / days_span, 2)
402
-
403
- # Analyze active hours
404
- hours = Counter([t.hour for t in commit_times])
405
- active_hours = [
406
- f"{h:02d}-{(h+1):02d}"
407
- for h, c in hours.most_common(3)
408
- if c > len(commit_times) * 0.1
409
- ]
410
-
411
- # Estimate timezone from most active hours
412
- # NOTE: Unclear should show the closest timezone
413
- peak_hour = max(hours.items(), key=lambda x: x[1])[0]
414
- if 4 <= peak_hour <= 8:
415
- tz_hint = "UTC+8 to UTC+10"
416
- elif 8 <= peak_hour <= 12:
417
- tz_hint = "UTC+0 to UTC+2"
418
- elif 12 <= peak_hour <= 16:
419
- tz_hint = "UTC-6 to UTC-4"
420
- elif 16 <= peak_hour <= 20:
421
- tz_hint = "UTC-12 to UTC-8"
422
- else:
423
- tz_hint = "unclear"
424
-
425
- # Analyze burst patterns
426
- time_diffs = []
427
- for i in range(1, len(commit_times)):
428
- diff = (commit_times[i] - commit_times[i - 1]).total_seconds() / 3600
429
- time_diffs.append(diff)
430
-
431
- if time_diffs:
432
- avg_diff = statistics.mean(time_diffs)
433
- if avg_diff < 1:
434
- intensity = "high"
435
- elif avg_diff < 4:
436
- intensity = "moderate"
437
- else:
438
- intensity = "low"
439
-
440
- burst_duration = (
441
- "few hours"
442
- if avg_diff < 4
443
- else "day-length" if avg_diff < 24 else "multi-day"
444
- )
445
- burst_frequency = (
446
- "frequent"
447
- if commits_per_day > 3
448
- else "regular" if commits_per_day > 1 else "sporadic"
449
- )
450
- else:
451
- intensity = "low"
452
- burst_duration = "n/a"
453
- burst_frequency = "sporadic"
454
-
455
- return {
456
- "frequency": {
457
- "commits_per_day": commits_per_day,
458
- "active_hours": active_hours,
459
- "timezone_hint": tz_hint,
460
- },
461
- "burst_patterns": {
462
- "intensity": intensity,
463
- "average_duration": burst_duration,
464
- "frequency": burst_frequency,
465
- },
466
- }