aarushi-211 committed on
Commit
92c4d54
·
1 Parent(s): cd12f00

Added modules

Browse files
graphs/__init__.py ADDED
File without changes
graphs/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (223 Bytes). View file
 
graphs/__pycache__/graph_utils.cpython-311.pyc ADDED
Binary file (1.63 kB). View file
 
graphs/graph_utils.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def format_graph_context(func_summaries, call_graph=None, class_hierarchy=None):
    """Render function summaries plus graph info as a structured prompt.

    Args:
        func_summaries: mapping of function name -> one-line summary.
        call_graph: optional mapping of function name -> list of callee names;
            entries with no callees are omitted from the output.
        class_hierarchy: optional mapping of class name -> list of method names.

    Returns:
        A newline-joined string beginning with a fixed instruction line,
        followed by the sections that received data.
    """
    parts = ["You are summarizing a Python module."]

    if func_summaries:
        parts.append("Function Summaries:")
        parts.extend(f"- {fn}: {desc}" for fn, desc in func_summaries.items())

    if call_graph:
        parts.append("\nCall Graph:")
        # Skip functions that call nothing — an empty arrow line is noise.
        parts.extend(
            f"- {src} → {', '.join(dsts)}"
            for src, dsts in call_graph.items()
            if dsts
        )

    if class_hierarchy:
        parts.append("\nClass Hierarchy:")
        parts.extend(
            f"- {name}: [{', '.join(methods)}]"
            for name, methods in class_hierarchy.items()
        )

    return "\n".join(parts)
models/__init__.py ADDED
File without changes
models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (223 Bytes). View file
 
models/__pycache__/models.cpython-311.pyc ADDED
Binary file (1.16 kB). View file
 
models/models.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer

# transformers pipelines use 0 for the first CUDA device and -1 for CPU.
device = 0 if torch.cuda.is_available() else -1

# CodeT5 checkpoint fine-tuned for multi-language code summarization; used
# below to generate one-line summaries of individual function bodies.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base-multi-sum")
model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-base-multi-sum")

# Function-level summarizer (consumed by summarizers.file_summarizers).
func_summarizer = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    batch_size=8,  # batch multiple function bodies per forward pass
)

# Sentence-embedding model. Not used in this module; presumably for
# similarity/ranking elsewhere — confirm at call sites before removing.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# File/repo-level summarizer. LED accepts long inputs (up to 16k tokens),
# which suits the concatenated per-function context built upstream.
file_summarizer = pipeline(
    "summarization",
    model="allenai/led-base-16384",
    tokenizer="allenai/led-base-16384",
    device=device,
    truncation=True,  # clip inputs that still exceed the model's max length
    max_length=128,
    min_length=64,
)
parsers/__init__.py ADDED
File without changes
parsers/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (224 Bytes). View file
 
parsers/__pycache__/parsers.cpython-311.pyc ADDED
Binary file (2.8 kB). View file
 
parsers/parsers.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import ast

def extract_functions_from_code(code: str):
    """Map each function name in *code* to its source text.

    Walks the parsed AST and collects every ``ast.FunctionDef`` (async
    functions are not included), pairing the name with the exact source
    segment of the definition.

    Args:
        code: Python source to parse.

    Returns:
        dict of {function_name: source_segment}; empty dict when the
        source cannot be parsed.
    """
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # ast.parse raises SyntaxError for invalid code and ValueError for
        # source containing null bytes; anything else should propagate
        # rather than be silently swallowed by a bare except.
        return {}
    return {
        node.name: ast.get_source_segment(code, node)
        for node in ast.walk(tree)
        if isinstance(node, ast.FunctionDef)
    }
13
+
14
def extract_call_graph(code: str):
    """Build a name-level call graph for every function defined in *code*.

    Only simple-name calls (``g()``) are recorded; attribute calls such as
    ``obj.m()`` are ignored. Duplicate callees are collapsed via a set, so
    the order of each callee list is unspecified.

    Args:
        code: Python source to parse.

    Returns:
        dict of {function_name: [unique callee names]}; empty dict when
        the source cannot be parsed.
    """
    call_graph = {}
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # Narrowed from a bare except: only parse failures mean "no graph".
        return call_graph
    for node in ast.walk(tree):
        if isinstance(node, ast.FunctionDef):
            # isinstance(ast.Name) is the idiomatic form of the previous
            # hasattr(child.func, "id") duck-check.
            callees = {
                child.func.id
                for child in ast.walk(node)
                if isinstance(child, ast.Call) and isinstance(child.func, ast.Name)
            }
            call_graph[node.name] = list(callees)
    return call_graph
28
+
29
def extract_class_hierarchy(code: str):
    """Map each class defined in *code* to its directly-declared methods.

    Only ``ast.FunctionDef`` nodes appearing directly in the class body are
    counted (async methods and inherited methods are not included).

    Args:
        code: Python source to parse.

    Returns:
        dict of {class_name: [method names in definition order]}; empty
        dict when the source cannot be parsed.
    """
    class_map = {}
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # Narrowed from a bare except: only parse failures mean "no classes".
        return class_map
    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef):
            class_map[node.name] = [
                n.name for n in node.body if isinstance(n, ast.FunctionDef)
            ]
    return class_map
summarizers/__init__.py ADDED
File without changes
summarizers/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (228 Bytes). View file
 
summarizers/__pycache__/file_summarizers.cpython-311.pyc ADDED
Binary file (3 kB). View file
 
summarizers/__pycache__/repo_summarizers.cpython-311.pyc ADDED
Binary file (1.74 kB). View file
 
summarizers/file_summarizers.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from parsers.parsers import extract_functions_from_code, extract_call_graph, extract_class_hierarchy
2
+ from graphs.graph_utils import format_graph_context
3
+ from models.models import func_summarizer, file_summarizer
4
+
5
def summarize_file_with_graph(code_text: str, top_k: int = 5):
    """Produce a natural-language summary of one Python file.

    Pipeline:
      1. Extract every function body and summarize each with CodeT5.
      2. Keep the *top_k* functions with the longest source text, a proxy
         for importance.
      3. Render those summaries plus call-graph / class-hierarchy context
         into a structured prompt and condense it with the LED summarizer.

    Args:
        code_text: source of the file to summarize.
        top_k: number of function summaries to keep in the prompt.

    Returns:
        The LED-generated summary string, or "No functions found." when
        the file defines no functions (or cannot be parsed).
    """
    functions = extract_functions_from_code(code_text)
    if not functions:
        return "No functions found."

    # One CodeT5 generation per function body; the pipeline batches them.
    generated = func_summarizer(list(functions.values()), max_length=64, do_sample=False)
    func_summaries = {
        name: out["generated_text"].strip()
        for name, out in zip(functions, generated)
    }

    # Longest source text ~ most important function (stable sort keeps
    # definition order among ties).
    ranked = sorted(func_summaries, key=lambda fn: len(functions[fn]), reverse=True)
    top_func_summaries = {fn: func_summaries[fn] for fn in ranked[:top_k]}

    # Assemble the structured prompt for the long-input summarizer.
    prompt = format_graph_context(
        top_func_summaries,
        extract_call_graph(code_text),
        extract_class_hierarchy(code_text),
    )

    return file_summarizer(
        prompt,
        max_length=128,
        min_length=64,
        no_repeat_ngram_size=3,
        do_sample=False,
    )[0]["summary_text"]
summarizers/repo_summarizers.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from summarizers.file_summarizers import summarize_file_with_graph
2
+ from models.models import file_summarizer
3
+
4
def summarize_repo_with_graph(file_dict: dict, top_files=5, top_k_funcs=5):
    """Summarize a repository from a {path: source} mapping.

    Summarizes up to *top_files* files (in dict order) with
    ``summarize_file_with_graph``, joins the per-file summaries, and
    condenses them with the LED summarizer.

    Args:
        file_dict: mapping of file path -> file source text.
        top_files: maximum number of files to process.
        top_k_funcs: per-file function-summary budget.

    Returns:
        The final repository summary, or "No valid summaries found." when
        every file failed to summarize.
    """
    per_file = []
    for path, source in list(file_dict.items())[:top_files]:
        try:
            per_file.append(summarize_file_with_graph(source, top_k=top_k_funcs))
        except Exception as e:
            # Best-effort: one unparseable or odd file must not abort the run.
            print(f"Skipped file {path} due to: {e}")

    if not per_file:
        return "No valid summaries found."

    return file_summarizer(
        "\n\n".join(per_file),
        max_length=256,
        min_length=100,
        no_repeat_ngram_size=3,
        do_sample=False,
    )[0]["summary_text"]
33
+
34
+ # from transformers import AutoTokenizer
35
+
36
+ # # reuse the same tokenizer instance you already loaded
37
+ # tokenizer = file_summarizer.tokenizer
38
+
39
+ # def summarize_repo_with_graph(file_dict: dict, top_files=5, top_k_funcs=5):
40
+ # file_summaries = []
41
+
42
+ # for file_path, code_text in list(file_dict.items())[:top_files]:
43
+ # try:
44
+ # summary = summarize_file_with_graph(code_text, top_k=top_k_funcs)
45
+ # file_summaries.append(summary)
46
+ # except Exception as e:
47
+ # print(f"Skipped file {file_path} due to: {e}")
48
+
49
+ # if not file_summaries:
50
+ # return "No valid summaries found."
51
+
52
+ # combined_input = "\n\n".join(file_summaries)
53
+
54
+ # # dynamic length cap
55
+ # tokens = tokenizer.encode(combined_input, truncation=False)
56
+ # suggested_max = min(256, max(100, len(tokens) // 2))
57
+ # suggested_min = min(100, suggested_max - 20)
58
+
59
+ # final_summary = file_summarizer(
60
+ # combined_input,
61
+ # max_length=suggested_max,
62
+ # min_length=suggested_min,
63
+ # no_repeat_ngram_size=3,
64
+ # do_sample=False,
65
+ # )[0]["summary_text"]
66
+
67
+ # return final_summary