nihalaninihal committed on
Commit
f8d227c
·
verified ·
1 Parent(s): bb1beb0

Delete analyze_repository_structure.py

Browse files
Files changed (1) hide show
  1. analyze_repository_structure.py +0 -553
analyze_repository_structure.py DELETED
@@ -1,553 +0,0 @@
1
import copy
import json
import time
from pathlib import Path
from typing import Any, Dict, List, Optional

from prompt_analyzer import create_handler
7
-
8
# File extensions treated as human-authored source/text worth including in the
# repository tree and in code-sample extraction.
# Fix: the original set literal listed ".hxcpp" twice; the duplicate was
# harmless in a set but dead weight, and is removed here.
RELEVANT_EXTENSIONS = {
    ".py",
    ".js",
    ".ts",
    ".jsx",
    ".tsx",
    ".java",
    ".cpp",
    ".c",
    ".h",
    ".hpp",
    ".rb",
    ".php",
    ".go",
    ".rs",
    ".swift",
    ".kt",
    ".kts",
    ".scala",
    ".pl",
    ".pm",
    ".r",
    ".sh",
    ".bat",
    ".ps1",
    ".lua",
    ".sql",
    ".html",
    ".css",
    ".xml",
    ".json",
    ".yaml",
    ".yml",
    ".md",
    ".ipynb",
    ".m",
    ".mm",
    ".vb",
    ".cs",
    ".fs",
    ".fsx",
    ".erl",
    ".hrl",
    ".ex",
    ".exs",
    ".dart",
    ".groovy",
    ".jl",
    ".clj",
    ".cljs",
    ".coffee",
    ".litcoffee",
    ".rkt",
    ".hs",
    ".lhs",
    ".ml",
    ".mli",
    ".nim",
    ".cr",
    ".nimble",
    ".hx",
    ".hxsl",
    ".hxproj",
    ".hxcpp",
    ".hxcs",
    ".hxjava",
    ".hxnode",
    ".hxphp",
    ".hxpy",
    ".hxrb",
    ".hxswf",
    ".hxvm",
    ".hxweb",
    ".hxwin",
    ".hxwpf",
    ".sol",
    ".vy",
}
87
-
88
# Maps a file extension to a human-readable language name. Used by
# analyze_repository_structure() to tally per-repository language frequencies
# for the "languages" summary string. Keys mirror RELEVANT_EXTENSIONS.
LANGUAGE_EXTENSIONS = {
    ".py": "Python",
    ".js": "JavaScript",
    ".ts": "TypeScript",
    ".jsx": "React",
    ".tsx": "React TypeScript",
    ".java": "Java",
    ".cpp": "C++",
    ".c": "C",
    ".h": "C/C++ Header",
    ".hpp": "C++ Header",
    ".rb": "Ruby",
    ".php": "PHP",
    ".go": "Go",
    ".rs": "Rust",
    ".swift": "Swift",
    ".kt": "Kotlin",
    ".kts": "Kotlin Script",
    ".scala": "Scala",
    ".pl": "Perl",
    ".pm": "Perl Module",
    ".r": "R",
    ".sh": "Shell",
    ".bat": "Batch",
    ".ps1": "PowerShell",
    ".lua": "Lua",
    ".sql": "SQL",
    ".html": "HTML",
    ".css": "CSS",
    ".xml": "XML",
    ".json": "JSON",
    ".yaml": "YAML",
    ".yml": "YAML",
    ".md": "Markdown",
    ".ipynb": "Jupyter Notebook",
    ".m": "MATLAB/Objective-C",
    ".mm": "Objective-C++",
    ".vb": "Visual Basic",
    ".cs": "C#",
    ".fs": "F#",
    ".fsx": "F# Script",
    ".erl": "Erlang",
    ".hrl": "Erlang Header",
    ".ex": "Elixir",
    ".exs": "Elixir Script",
    ".dart": "Dart",
    ".groovy": "Groovy",
    ".jl": "Julia",
    ".clj": "Clojure",
    ".cljs": "ClojureScript",
    ".coffee": "CoffeeScript",
    ".litcoffee": "Literate CoffeeScript",
    ".rkt": "Racket",
    ".hs": "Haskell",
    ".lhs": "Literate Haskell",
    ".ml": "OCaml",
    ".mli": "OCaml Interface",
    ".nim": "Nim",
    ".cr": "Crystal",
    ".nimble": "Nimble",
    ".hx": "Haxe",
    ".hxsl": "Haxe Shader",
    ".hxproj": "Haxe Project",
    ".hxcpp": "Haxe C++",
    ".hxcs": "Haxe C#",
    ".hxjava": "Haxe Java",
    ".hxnode": "Haxe Node.js",
    ".hxphp": "Haxe PHP",
    ".hxpy": "Haxe Python",
    ".hxrb": "Haxe Ruby",
    ".hxswf": "Haxe SWF",
    ".hxvm": "Haxe VM",
    ".hxweb": "Haxe Web",
    ".hxwin": "Haxe Windows",
    ".hxwpf": "Haxe WPF",
    ".sol": "Solidity",
    ".vy": "Vyper",
}
166
-
167
# Well-known dependency/build manifests mapped to the tool ("manager") that
# owns them. _extract_documentation() reads each file it finds into
# docs["package_info"][manager].
# NOTE(review): several filenames share one manager value (go.mod/go.sum ->
# "go", the rebar* files -> "erlang", Pipfile/Pipfile.lock -> "pipenv",
# environment.yml/meta.yaml -> "conda"). Because package_info is keyed by the
# manager value, a later file's contents overwrite an earlier one's when both
# exist in a repository — confirm this is intended.
PACKAGE_FILES = {
    "package.json": "npm",
    "requirements.txt": "pip",
    "setup.py": "python",
    "pom.xml": "maven",
    "build.gradle": "gradle",
    "Gemfile": "bundler",
    "Cargo.toml": "cargo",
    "go.mod": "go",
    "go.sum": "go",
    "composer.json": "composer",
    "pubspec.yaml": "dart",
    "Project.toml": "julia",
    "mix.exs": "elixir",
    "Makefile": "make",
    "CMakeLists.txt": "cmake",
    "SConstruct": "scons",
    "build.xml": "ant",
    "Rakefile": "rake",
    "shard.yml": "crystal",
    "nim.cfg": "nim",
    "default.nix": "nix",
    "stack.yaml": "haskell",
    "rebar.config": "erlang",
    "rebar.lock": "erlang",
    "rebar3.config": "erlang",
    "rebar3.lock": "erlang",
    "project.clj": "leiningen",
    "deps.edn": "clojure",
    "build.boot": "boot",
    "build.sbt": "sbt",
    "Brewfile": "homebrew",
    "Vagrantfile": "vagrant",
    "Dockerfile": "docker",
    "docker-compose.yml": "docker-compose",
    "Procfile": "heroku",
    "tox.ini": "tox",
    "pyproject.toml": "poetry",
    "Pipfile": "pipenv",
    "Pipfile.lock": "pipenv",
    "environment.yml": "conda",
    "meta.yaml": "conda",
}
210
-
211
-
212
def analyze_repository_structure(repo_names: List[str], user_path: Path) -> Dict[str, Any]:
    """Processes source code from repositories to build LLM-friendly structure"""
    result: Dict[str, Any] = {}
    username = user_path.name

    for repo_name in repo_names:
        # Repositories are stored on disk as <user_path>/<username>_<repo>.git
        repo_path = user_path / f"{username}_{repo_name}.git"

        print("processing,", repo_name, "path:", repo_path)

        if not repo_path.exists():
            print("skipping")
            continue

        structure = _build_tree_structure(repo_path)

        # Tally how often each recognized language appears among the files
        # that made it into the tree.
        tally: Dict[str, int] = {}
        for info in _get_source_files(structure):
            lang = LANGUAGE_EXTENSIONS.get(info["extension"].lower())
            if lang is not None:
                tally[lang] = tally.get(lang, 0) + 1

        # Most frequent language first; ties broken alphabetically.
        ordered = sorted(tally.items(), key=lambda kv: (-kv[1], kv[0]))

        result[repo_name] = {
            "structure": structure,
            "file_stats": _analyze_file_statistics(repo_path),
            "documentation": _extract_documentation(repo_path),
            "languages": ", ".join(name for name, _ in ordered),
        }

    # Adds a "samples" entry to each repo's dict in place.
    _extract_code_samples(result, user_path)

    return result
258
-
259
-
260
def _build_tree_structure(repo_path: Path, files_per_dir: int = 20, max_depth: int = 3) -> Dict[str, Any]:
    """
    Builds a tree representation of repository structure with limits.

    Args:
        repo_path: Repository path
        files_per_dir: Maximum number of files to include per directory (default: 20)
        max_depth: Maximum depth for nested directories (default: 3)

    Returns:
        Nested dict of nodes: directories carry "children" (files, note nodes,
        and non-empty subdirectories); files carry path/extension/size.
    """
    # VCS metadata and build artifacts are never useful to downstream analysis.
    skip_dirs = {".git", "node_modules", "__pycache__", "build", "dist"}

    def create_tree(path: Path, current_depth: int = 0) -> Dict[str, Any]:
        tree: Dict[str, Any] = {
            "type": "directory",
            "name": path.name,
            "path": str(path.relative_to(repo_path)),
            "children": [],
        }

        # Stop traversing if we hit max depth (checked before the skip set to
        # preserve the original behavior of emitting a note even for skipped
        # directory names at the depth limit).
        if current_depth >= max_depth:
            tree["children"] = [{
                "type": "note",
                "message": f"Directory depth limit ({max_depth}) reached",
            }]
            return tree

        # Fix: the original listed the directory with iterdir() *before*
        # testing the skip set, so huge node_modules/.git trees were read from
        # disk only to be discarded. Returning an empty (hence pruned) node
        # first produces identical output without the wasted I/O.
        if path.name in skip_dirs:
            return tree

        try:
            items = list(path.iterdir())

            # Files first, capped at files_per_dir per directory.
            files = [
                item for item in items
                if item.is_file() and item.suffix.lower() in RELEVANT_EXTENSIONS
            ]
            for item in files[:files_per_dir]:
                tree["children"].append({
                    "type": "file",
                    "name": item.name,
                    "path": str(item.relative_to(repo_path)),
                    "extension": item.suffix.lower(),
                    "size": item.stat().st_size,
                })

            # Recurse into subdirectories; only non-empty ones are kept.
            for item in items:
                if item.is_dir():
                    subtree = create_tree(item, current_depth + 1)
                    if subtree["children"]:
                        tree["children"].append(subtree)

        except PermissionError:
            # Unreadable directory: return whatever was gathered so far.
            pass

        return tree

    return create_tree(repo_path)
327
-
328
-
329
def _analyze_file_statistics(repo_path: Path) -> Dict[str, Any]:
    """Analyzes file statistics for the repository.

    Returns:
        {"file_count": int, "total_loc": int} over files whose extension is a
        LANGUAGE_EXTENSIONS key; "total_loc" counts non-blank lines only.
        Files that cannot be decoded as UTF-8 or read are skipped.
    """
    file_count = 0
    total_loc = 0

    for ext in LANGUAGE_EXTENSIONS:
        # rglob(f"*{ext}") already guarantees the extension. The original
        # additionally ran `any(p in str(file_path) for p in
        # RELEVANT_EXTENSIONS)` — a substring test that was effectively always
        # true (every LANGUAGE_EXTENSIONS key is in RELEVANT_EXTENSIONS and
        # appears in the matched path), so it filtered nothing and has been
        # removed as dead code.
        for file_path in repo_path.rglob(f"*{ext}"):
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
            except (UnicodeDecodeError, PermissionError):
                continue
            # Non-blank lines only.
            total_loc += len([line for line in content.splitlines() if line.strip()])
            file_count += 1

    return {
        "file_count": file_count,
        "total_loc": total_loc,
    }
353
-
354
-
355
def _extract_documentation(repo_path: Path) -> Dict[str, Any]:
    """Extracts documentation and metadata from repository.

    Returns:
        {"readme": str | None, "package_info": {manager: file contents}}.
        Fix: "readme" is now always present (None when no README exists, not
        only when one exists but is unreadable), so consumers see a stable
        dict shape either way.
    """
    docs: Dict[str, Any] = {"readme": None}

    # First README* match wins (e.g. README, README.md, README.rst).
    readme_paths = list(repo_path.glob("README*"))
    if readme_paths:
        try:
            with open(readme_paths[0], "r", encoding="utf-8") as f:
                docs["readme"] = f.read()
        except (UnicodeDecodeError, PermissionError):
            docs["readme"] = None

    docs["package_info"] = {}
    for filename, pkg_type in PACKAGE_FILES.items():
        pkg_path = repo_path / filename
        if not pkg_path.exists():
            continue
        try:
            with open(pkg_path, "r", encoding="utf-8") as f:
                # NOTE(review): several filenames share one manager key (e.g.
                # go.mod/go.sum -> "go"), so a later file overwrites an
                # earlier one here; kept as-is to preserve behavior.
                docs["package_info"][pkg_type] = f.read()
        except (UnicodeDecodeError, PermissionError):
            continue

    return docs
379
-
380
-
381
def _extract_code_samples(sources_data: Dict[str, Any], user_path: Path, max_file_size: int = 100000) -> Dict[str, Any]:
    """
    Extracts code samples for files identified as relevant by Gemini.
    Filters out files larger than max_file_size bytes.

    Mutates sources_data in place (adds a "samples" dict per repository) and
    returns it. On any error the data gathered so far is returned unchanged.
    """
    handler = create_handler()

    try:
        # Preprocess to remove large files from consideration.
        filtered_structures = {}
        for repo_name, repo_data in sources_data.items():
            # Fix: the original used a shallow .copy() here, but
            # filter_large_files mutates nested "children" lists, which
            # silently stripped large files from the structures stored in
            # sources_data as well. deepcopy isolates the filtered view.
            structure_copy = copy.deepcopy(repo_data["structure"])

            # Filter function to remove large files (directories are kept and
            # recursed into; files over max_file_size are dropped).
            def filter_large_files(node):
                if node.get("type") == "directory":
                    node["children"] = [
                        child for child in node.get("children", [])
                        if child.get("type") == "directory"
                        or (child.get("type") == "file" and child.get("size", 0) <= max_file_size)
                    ]
                    for child in node["children"]:
                        if child.get("type") == "directory":
                            filter_large_files(child)
                return node

            # Apply filter
            filtered_structures[repo_name] = filter_large_files(structure_copy)

        # Create a combined prompt for all repositories
        prompt = f"""
Analyze the repository structures and identify the most relevant files for codebase analysis.

Focus on files that would reveal:
1. Core functionality and architecture
2. Main business logic
3. Key utilities and helpers
4. Configuration and setup

Results will be used for further code analysis. Remember to include ALL relevant files, especially for fullstack applications. Be thorough but concise. Avoid including non-original code, e.g., dependencies or libraries code. AVOID INCLUDING MORE THAN 50 FILES PER REPOSITORY!!! TRY TO INCLUDE LESS THAN 20 IF POSSIBLE. CORE_FILES ARE THE PRIORITY, YOU CAN OMITT THE REST IF IT EXCEEDS THE LIMIT.

Return a JSON object with these categories:

{{
    "repositories": {{ // MANDATORY highest level key
        "repo_name": {{ // MANDATORY name of the repository you are analyzing
            "core_files": ["list of most important files"], // MAX 20 files!
            "secondary_files": ["list of supporting files"], // MAX 20 files!
            "config_files": ["list of relevant config files"] // MAX 10 files!
        }},
        "repo_name": {{...}},
    }}
}}

CRITICAL REQUIREMENTS:

Limit each list of most important files to a maximum of 20 files!!!

Avoid including binary files or large data files. Only include files that are essential for understanding the codebase. Avoid including too many files, focus on the most important ones. Avoid including files that user did not write, e.g., dependencies or libraries code. Avoid including utility files that are not essential for understanding the codebase. Focus on including only source code, some repositories may have a lot of files, but only a few are essential for understanding the codebase. Do not include long .json files or other artifact type of files - notice "size" of the file in the structure.

Repository structures:
{json.dumps(filtered_structures, indent=2)}

Only include files that exist in the structure. Return valid JSON format.
DO NOT wrap the JSON in markdown code blocks.
"""

        # Get file categories for all repositories
        file_categories = handler.generate_json_response(prompt)

        if not file_categories:
            print("Skipping due to API error")
            return sources_data

        for repo_name, repo_data in sources_data.items():
            repo_data["samples"] = {
                "core_files": {},
                "utility_files": {},
                "config_files": {},
            }

            # Filter out large files from consideration
            all_files = {
                file_info["path"]: file_info
                for file_info in _get_source_files(repo_data["structure"])
                if file_info.get("size", 0) <= max_file_size
            }

            repo_categories = file_categories["repositories"].get(repo_name, {})

            # Fix: the prompt asks the model for "secondary_files", but the
            # original looked up "utility_files" in the response, so the
            # utility_files bucket was always empty. Map response keys to the
            # (unchanged) sample bucket names.
            for response_key, sample_key in (
                ("core_files", "core_files"),
                ("secondary_files", "utility_files"),
                ("config_files", "config_files"),
            ):
                for file_path in repo_categories.get(response_key, []):
                    if file_path not in all_files:
                        continue

                    source_code = _read_source_file(user_path, repo_name, file_path)
                    if source_code:
                        repo_data["samples"][sample_key][file_path] = source_code

    except Exception as e:
        print(f"Error processing code samples: {str(e)}")

    return sources_data
482
-
483
-
484
- def _get_source_files(structure: Dict[str, Any]) -> List[Dict[str, Any]]:
485
- """Helper to recursively extract source files from tree structure"""
486
- files = []
487
-
488
- def traverse(node: Dict[str, Any]):
489
- if not isinstance(node, dict):
490
- return
491
-
492
- # If it's a file, add it
493
- if node.get("type") == "file":
494
- files.append(node)
495
-
496
- # If it's a directory, traverse its children
497
- elif node.get("type") == "directory" and "children" in node:
498
- for child in node.get("children", []):
499
- traverse(child)
500
-
501
- # Also check any other dictionaries that might contain nested structures
502
- for value in node.values():
503
- if isinstance(value, dict):
504
- traverse(value)
505
- elif isinstance(value, list):
506
- for item in value:
507
- if isinstance(item, dict):
508
- traverse(item)
509
-
510
- traverse(structure)
511
-
512
- # Sort files by path for consistent ordering
513
- return sorted(files, key=lambda x: x["path"])
514
-
515
-
516
def _read_source_file(user_path: Path, repo_name: str, file_path: str) -> Optional[str]:
    """Reads source code from file with proper error handling"""
    try:
        # Repositories live at <user_path>/<username>_<repo>.git/<file_path>.
        full_path = user_path / f"{user_path.name}_{repo_name}.git" / file_path

        # Missing (or non-regular) files yield None rather than an error.
        if not full_path.is_file():
            return None

        # Anything outside the known source/text extensions is treated as
        # binary and skipped.
        if full_path.suffix.lower() not in RELEVANT_EXTENSIONS:
            return None

        # Attempt encodings in order; latin-1 essentially never fails to
        # decode, so it acts as a fallback before cp1252 is ever reached.
        for encoding in ("utf-8", "latin-1", "cp1252"):
            try:
                with open(full_path, "r", encoding=encoding) as handle:
                    text = handle.read()
            except UnicodeDecodeError:
                continue
            except Exception as e:
                print(f"Error reading {full_path}: {str(e)}")
                return None

            # NUL bytes indicate binary content that slipped past the
            # extension filter.
            if "\0" in text:
                return None
            return text

        return None

    except Exception as e:
        print(f"Error accessing {file_path}: {str(e)}")
        return None