|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
NeMo dependency structure definition. |
|
|
This module analyzes the codebase to determine internal dependencies between NeMo collections and core components. |
|
|
""" |
|
|
|
|
|
import ast |
|
|
import json |
|
|
import os |
|
|
from typing import Dict, List, Set |
|
|
|
|
|
|
|
|
def find_python_files(directory: str) -> List[str]:
    """Collect the paths of all ``.py`` files under the scanned source trees.

    Only the ``nemo``, ``scripts``, ``examples`` and ``tests`` sub-directories
    of ``directory`` are searched; sub-directories that do not exist are
    skipped.

    Args:
        directory: Directory that contains the scanned sub-directories.

    Returns:
        Paths of every ``.py`` file found, each built by joining the walked
        directory with the file name, in ``os.walk`` order.
    """
    collected: List[str] = []

    for subdir in ('nemo', 'scripts', 'examples', 'tests'):
        search_root = os.path.join(directory, subdir)
        if not os.path.exists(search_root):
            continue

        for current_dir, _subdirs, filenames in os.walk(search_root):
            collected.extend(
                os.path.join(current_dir, filename) for filename in filenames if filename.endswith('.py')
            )

    return collected
|
|
|
|
|
|
|
|
def analyze_imports(nemo_root: str, file_path: str) -> Set[str]:
    """Analyze a Python file and return its NeMo package dependencies using AST parsing.

    Every ``from nemo.<...> import name`` statement in the file is inspected.
    Imports from ``nemo.collections.<collection>...`` and from any package
    reported by ``find_top_level_packages`` are recorded. When the imported
    module is a package whose ``__init__.py`` re-exports ``name`` from another
    NeMo module, the re-export target is recorded instead of the package path.
    Star imports and plain ``import nemo...`` statements are ignored.

    Args:
        nemo_root: Directory that contains the ``nemo`` package directory.
        file_path: Path of the Python file to analyze.

    Returns:
        Dotted ``module.name`` strings for every recorded import. The set is
        empty (or partial) if the file cannot be read or parsed; the error is
        printed rather than raised.
    """
    imports: Set[str] = set()
    # Completed ``__init__.py`` analyses, keyed by package directory. Caching
    # makes repeated lookups of the same package return the same mapping; a
    # plain "visited" set would make every lookup after the first return {}.
    init_cache: Dict[str, Dict[str, str]] = {}
    # Packages currently being analyzed; used only to break import cycles.
    in_progress: Set[str] = set()

    def get_init_imports(module_path: str, depth: int = 0) -> Dict[str, str]:
        """Recursively analyze imports from __init__.py files and map them to their final destinations."""
        if module_path in init_cache:
            return init_cache[module_path]

        # Guard against runaway recursion: cap the depth and cut import cycles.
        if depth > 10 or module_path in in_progress:
            return {}

        init_path = os.path.join(module_path, '__init__.py')
        if not os.path.exists(init_path):
            init_cache[module_path] = {}
            return init_cache[module_path]

        in_progress.add(module_path)
        import_map: Dict[str, str] = {}
        try:
            with open(init_path, 'r', encoding='utf-8') as f:
                init_tree = ast.parse(f.read(), filename=init_path)

            for node in ast.walk(init_tree):
                if not (isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.')):
                    continue

                names = [alias.name for alias in node.names if alias.name != '*']
                if not names:
                    continue

                module_parts = node.module.split('.')
                module_dir = os.path.join(nemo_root, *module_parts)

                if not os.path.exists(os.path.join(module_dir, '__init__.py')):
                    # Not a package: the names come straight from that module.
                    for name in names:
                        import_map[name] = f"{node.module}.{name}"
                    continue

                # The source is itself a package: follow its re-exports.
                sub_imports = get_init_imports(module_dir, depth + 1)
                module_file = os.path.join(module_dir, f"{module_parts[-1]}.py")
                for name in names:
                    if name in sub_imports:
                        import_map[name] = sub_imports[name]
                    elif os.path.exists(module_file):
                        import_map[name] = f"{node.module}.{name}"
        except Exception as e:  # best effort: an unreadable __init__ must not abort the scan
            print(f"Error analyzing {init_path}: {e}")
            import_map = {}
        finally:
            in_progress.discard(module_path)

        init_cache[module_path] = import_map
        return import_map

    # Resolved on first use so files that only import collections never
    # trigger a directory listing.
    top_level_packages = None

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            tree = ast.parse(f.read(), filename=file_path)

        for node in ast.walk(tree):
            if not (isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.')):
                continue

            # The 'nemo.' prefix guarantees at least two components.
            parts = node.module.split('.')
            module_type = parts[1]

            if module_type == 'collections':
                # ``from nemo.collections import x`` names no specific collection.
                if len(parts) == 2:
                    continue
            else:
                if top_level_packages is None:
                    top_level_packages = set(find_top_level_packages(nemo_root))
                if module_type not in top_level_packages:
                    continue

            names = [alias.name for alias in node.names if alias.name != '*']
            if not names:
                continue

            # One lookup per import statement; the mapping is shared by all names.
            init_imports = get_init_imports(os.path.join(nemo_root, *parts))
            for name in names:
                imports.add(init_imports.get(name, f"{node.module}.{name}"))

    except Exception as e:  # best effort: report and return what was collected
        print(f"Error analyzing {file_path}: {e}")

    return imports
|
|
|
|
|
|
|
|
def find_top_level_packages(nemo_root: str) -> List[str]:
    """Find all top-level packages under the nemo and tests directories.

    A package is any immediate sub-directory of ``<nemo_root>/nemo`` or
    ``<nemo_root>/tests`` whose name does not start with ``__``. Each entry is
    checked against the directory it was listed from, and a name present in
    both trees is reported once.

    Args:
        nemo_root: Directory that contains the ``nemo`` and ``tests`` directories.

    Returns:
        Sorted, duplicate-free list of package directory names. Empty if
        either directory is missing, in which case a warning is printed.
    """
    nemo_dir = os.path.join(nemo_root, 'nemo')
    tests_dir = os.path.join(nemo_root, 'tests')

    if not os.path.exists(nemo_dir):
        print(f"Warning: nemo directory not found at {nemo_dir}")
        return []
    if not os.path.exists(tests_dir):
        print(f"Warning: tests directory not found at {tests_dir}")
        return []

    packages: Set[str] = set()
    for parent_dir in (nemo_dir, tests_dir):
        for item in os.listdir(parent_dir):
            # Skip dunder directories such as __pycache__.
            if not item.startswith('__') and os.path.isdir(os.path.join(parent_dir, item)):
                packages.add(item)

    return sorted(packages)
|
|
|
|
|
|
|
|
def find_collection_modules(nemo_root: str) -> Dict[str, List[str]]:
    """Map every collection under ``nemo/collections`` to an empty module list.

    Args:
        nemo_root: Directory that contains the ``nemo`` package directory.

    Returns:
        A dict keyed by ``nemo.collections.<name>`` for each sub-directory of
        ``<nemo_root>/nemo/collections`` whose name does not start with
        ``__``; every value is a new empty list. Empty if the collections
        directory does not exist, in which case a warning is printed.
    """
    collections_dir = os.path.join(nemo_root, 'nemo', 'collections')

    if not os.path.exists(collections_dir):
        print(f"Warning: collections directory not found at {collections_dir}")
        return {}

    return {
        f"nemo.collections.{entry}": []
        for entry in os.listdir(collections_dir)
        if os.path.isdir(os.path.join(collections_dir, entry)) and not entry.startswith('__')
    }
|
|
|
|
|
|
|
|
# Every CI bucket; assigned to inputs that affect all test suites.
_ALL_BUCKETS = ("nemo2", "unit-tests", "speech", "automodel", "export-deploy")

# Substrings of a dependency name that select each bucket.
_SPEECH_MARKERS = (
    "nemo.collections.asr",
    "nemo.collections.tts",
    "nemo.collections.speechlm",
    "nemo.collections.audio",
    "tests.collections.asr",
    "tests.collections.tts",
    "tests.collections.speechlm",
    "tests.collections.audio",
)
_EXPORT_DEPLOY_MARKERS = ("nemo.export", "nemo.deploy", "tests.export", "tests.deploy")
_AUTOMODEL_MARKERS = (
    "nemo.collections.llm",
    "nemo.collections.vlm",
    "nemo.automodel",
    "tests.collections.llm",
    "tests.collections.vlm",
    "tests.automodel",
)

# File-name substrings mapped to buckets. Applied in order, so a later match
# overrides an earlier one for the same file.
_CI_FILE_BUCKETS = (
    ("cicd-main-export-deploy", ("export-deploy",)),
    ("cicd-main-nemo2", ("nemo2",)),
    ("cicd-main-speech", ("speech",)),
    ("cicd-main-automodel", ("automodel",)),
    ("cicd-main-unit-tests", ("unit-tests",)),
    ("Dockerfile", _ALL_BUCKETS),
)


def _collect_direct_dependencies(nemo_root: str, top_level_packages: Set[str]) -> Dict[str, List[str]]:
    """Map each scanned module (dotted path) to the NeMo imports found in its file."""
    direct: Dict[str, List[str]] = {}

    for file_path in find_python_files(nemo_root):
        relative_path = os.path.relpath(file_path, nemo_root)
        parts = relative_path.split(os.sep)

        if len(parts) == 1 or parts[0] not in ("nemo", "tests"):
            continue

        module_path = os.path.splitext(relative_path)[0].replace(os.sep, ".")

        if parts[0] == "tests":
            # A test file is recorded as depending on itself only.
            direct[module_path] = [module_path]
        elif parts[1] == "collections" or parts[1] in top_level_packages:
            # Sorted so the output does not depend on set iteration order.
            direct[module_path] = sorted(analyze_imports(nemo_root, file_path))

    return direct


def _invert_dependencies(dependencies: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """Turn "module -> what it imports" into "import target -> modules that import it"."""
    reverse: Dict[str, List[str]] = {}
    for package, deps in dependencies.items():
        for dep in deps:
            reverse.setdefault(dep, []).append(package)
    return reverse


def _transitive_closure(dependencies: Dict[str, List[str]]) -> Dict[str, Set[str]]:
    """Extend every entry with the entries of its members until nothing changes."""
    closure: Dict[str, Set[str]] = {package: set(deps) for package, deps in dependencies.items()}

    changed = True
    while changed:
        changed = False
        for deps in closure.values():
            # Gather first, then merge, so no set is mutated while iterated.
            inherited: Set[str] = set()
            for dep in deps:
                inherited |= closure.get(dep, set())
            if not inherited <= deps:
                deps |= inherited
                changed = True

    return closure


def _simplified_key(nemo_root: str, package: str) -> str:
    """Return the path (relative to nemo_root) that stands for an import target."""
    parts = package.split('.')

    if parts[0] == "tests":
        return f"{os.path.join(*parts)}.py"

    if len(parts) >= 2:
        # Drop the last component (the imported name) and look for its module.
        parent = os.path.join(*parts[:-1])
        if os.path.isfile(os.path.join(nemo_root, f"{parent}.py")):
            return f"{parent}.py"
        if os.path.isdir(os.path.join(nemo_root, parent)):
            return parent

    return package


def _simplify_dependencies(
    nemo_root: str,
    dependencies: Dict[str, Set[str]],
    top_level_packages: Set[str],
    collection_modules: Dict[str, List[str]],
) -> Dict[str, List[str]]:
    """Reduce keys to file paths and values to top-level packages or collections."""
    simplified: Dict[str, Set[str]] = {}

    for package, deps in dependencies.items():
        names = simplified.setdefault(_simplified_key(nemo_root, package), set())

        for dep in deps:
            dep_parts = dep.split('.')

            if len(dep_parts) >= 2 and dep_parts[1] in top_level_packages and dep_parts[1] != 'collections':
                names.add(f"{dep_parts[0]}.{dep_parts[1]}")
            elif dep_parts[0] == "tests":
                names.add(dep)
            elif len(dep_parts) >= 3 and f"nemo.{dep_parts[1]}.{dep_parts[2]}" in collection_modules:
                names.add(f"nemo.{dep_parts[1]}.{dep_parts[2]}")

        # The import target itself also takes part in bucket selection.
        names.add(package)

    return {path: sorted(names) for path, names in simplified.items()}


def _buckets_for(dep: str) -> Set[str]:
    """Return the CI buckets selected by a single dependency name."""
    buckets: Set[str] = set()
    is_speech = any(marker in dep for marker in _SPEECH_MARKERS)

    if is_speech:
        buckets.update(("speech", "unit-tests"))
    if any(marker in dep for marker in _EXPORT_DEPLOY_MARKERS):
        buckets.update(("export-deploy", "unit-tests"))
    if any(marker in dep for marker in _AUTOMODEL_MARKERS):
        buckets.update(("automodel", "unit-tests"))
    if "tests" in dep and "tests.functional_tests" not in dep:
        buckets.add("unit-tests")
    if "nemo.collections" in dep and not is_speech:
        buckets.update(("nemo2", "unit-tests"))

    return buckets


def build_dependency_graph(nemo_root: str) -> Dict[str, List[str]]:
    """Build a dependency graph by analyzing all Python files.

    The steps are: collect the NeMo imports of every scanned module, invert
    the mapping so each import target lists the modules that use it, extend it
    transitively, reduce keys to file paths and values to packages or
    collections, and finally translate the values into CI bucket names.
    Requirements files, Dockerfiles and ``cicd-main-*`` files are then added
    with fixed buckets.

    Args:
        nemo_root: Directory that contains the ``nemo``, ``tests`` and
            ``requirements`` directories.

    Returns:
        A dict mapping a path relative to ``nemo_root`` to a list of bucket
        names, ordered by descending number of buckets.
    """
    top_level_packages = find_top_level_packages(nemo_root)
    print(f"Found top-level packages: {top_level_packages}")

    # Directory listings do not change during the run: resolve them once
    # instead of once per dependency edge.
    top_level_set = set(top_level_packages)
    collection_modules = find_collection_modules(nemo_root)

    direct = _collect_direct_dependencies(nemo_root, top_level_set)
    closure = _transitive_closure(_invert_dependencies(direct))
    simplified = _simplify_dependencies(nemo_root, closure, top_level_set, collection_modules)

    dependencies: Dict[str, List[str]] = {}
    for package, deps in simplified.items():
        buckets: Set[str] = set()
        for dep in deps:
            buckets |= _buckets_for(dep)
        dependencies[package] = sorted(buckets)

    # Any requirements change can affect every suite.
    requirements_dir = os.path.join(nemo_root, "requirements")
    if os.path.isdir(requirements_dir):
        for filename in os.listdir(requirements_dir):
            # Keyed relative to nemo_root regardless of the current directory.
            dependencies[os.path.join("requirements", filename)] = list(_ALL_BUCKETS)

    # CI definition files and Dockerfiles, matched by file name.
    for root, _, files in os.walk(nemo_root):
        for filename in files:
            relative_path = os.path.relpath(os.path.join(root, filename), nemo_root)
            for marker, buckets_for_file in _CI_FILE_BUCKETS:
                if marker in filename:
                    dependencies[relative_path] = list(buckets_for_file)

    # Entries that trigger the most buckets come first.
    return dict(sorted(dependencies.items(), key=lambda x: len(x[1]), reverse=True))
|
|
|
|
|
|
|
|
def main():
    """Build the dependency graph and write it to ``nemo_dependencies.json``.

    The root passed to ``build_dependency_graph`` is the directory three
    levels above this file. The JSON file is written to the current working
    directory with a four-space indent.
    """
    # Climb from this file's absolute path up three directory levels.
    repo_root = os.path.abspath(__file__)
    for _ in range(3):
        repo_root = os.path.dirname(repo_root)

    graph = build_dependency_graph(repo_root)
    serialized = json.dumps(graph, indent=4)

    with open('nemo_dependencies.json', 'w', encoding='utf-8') as out_file:
        out_file.write(serialized)
|
|
|
|
|
|
|
|
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|