syeedalireza commited on
Commit
5b30d83
·
verified ·
1 Parent(s): 4f1fa6a

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +59 -0
  2. app.py +29 -0
  3. inference.py +65 -0
  4. requirements.txt +4 -0
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python Docstring Generator
2
+
3
+ Generates docstrings for Python code snippets using a sequence-to-sequence model (e.g. T5 or CodeT5). Useful for code summarization and documentation.
4
+
5
+ ## Task
6
+
7
+ Given a Python function or code block (without a docstring), the model produces a short natural-language description suitable as a docstring.
8
+
9
+ ## Model
10
+
11
+ - Uses **Hugging Face Transformers** with a small T5 or CodeT5 checkpoint (e.g. `t5-small`, or `Salesforce/codet5-small` for code).
12
+ - Inference script loads the model and tokenizer and runs generation with configurable length and sampling.
13
+
14
+ ## Dataset
15
+
16
+ - Training (optional): datasets like **CodeXGlue** code-to-text, or **DocstringGeneration**-style data from Hugging Face Datasets.
17
+ - For inference only, no dataset is required; use pre-trained weights.
18
+
19
+ ## Usage
20
+
21
+ ```bash
22
+ pip install -r requirements.txt
23
+ python inference.py --input "def add(a, b): return a + b"
24
+ ```
25
+
26
+ For a quick demo in the browser, run the Gradio app:
27
+
28
+ ```bash
29
+ python app.py
30
+ ```
31
+
32
+ ## Example
33
+
34
+ Input:
35
+ ```python
36
+ def factorial(n):
37
+ if n <= 1:
38
+ return 1
39
+ return n * factorial(n - 1)
40
+ ```
41
+
42
+ Output (example): `"Compute the factorial of n recursively."`
43
+
44
+ ## Files
45
+
46
+ - `inference.py` — loads T5 (or CodeT5), runs generation; can take a file path or inline code.
47
+ - `app.py` — Gradio UI for pasting code and getting a docstring.
48
+
49
+ ## Limitations / future work
50
+
51
+ - Quality depends on the base model and any fine-tuning; out-of-domain code may get generic descriptions.
52
+ - Could be extended to multi-line docstrings or different styles (Google, NumPy, Sphinx).
53
+
54
+ ## Author
55
+
56
+ **Alireza Aminzadeh**
57
+ - Email: [alireza.aminzadeh@hotmail.com](mailto:alireza.aminzadeh@hotmail.com)
58
+ - Hugging Face: [syeedalireza](https://huggingface.co/syeedalireza)
59
+ - LinkedIn: [alirezaaminzadeh](https://www.linkedin.com/in/alirezaaminzadeh)
app.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Minimal Gradio app for docstring generation.
3
+ Run: python app.py
4
+ """
5
+
6
+ import gradio as gr
7
+ from inference import generate_docstring
8
+
9
+
10
def summarize_code(code: str) -> str:
    """Return a model-generated docstring summary for *code*.

    Falls back to a user-facing prompt when the input is empty or
    whitespace-only, so the UI never calls the model with nothing.
    """
    if code and code.strip():
        return generate_docstring(code, model_name="t5-small", max_length=128, num_beams=4)
    return "Paste a Python code snippet above."
14
+
15
+
16
# Module-level Gradio interface: one multi-line code text box in,
# one read-only text box out, wired to summarize_code above.
demo = gr.Interface(
    fn=summarize_code,
    inputs=gr.Textbox(
        label="Python code",
        placeholder="def add(a, b):\n return a + b",
        lines=8,
    ),
    outputs=gr.Textbox(label="Generated docstring"),
    title="Python Docstring Generator",
    description="Paste a Python function or snippet to get a short docstring summary.",
)

# Launch the local web server only when run as a script, not on import
# (inference.py and tests import this module without starting a server).
if __name__ == "__main__":
    demo.launch()
inference.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Inference script for docstring generation from Python code.
3
+ Uses Hugging Face Transformers (T5 or CodeT5).
4
+ """
5
+
6
+ import argparse
7
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
8
+ import torch
9
+
10
+
11
# Cache of loaded (tokenizer, model) pairs keyed by checkpoint name, so that
# repeated calls (e.g. one per Gradio request) do not reload weights from disk.
_MODEL_CACHE = {}


def generate_docstring(
    code: str,
    model_name: str = "t5-small",
    max_length: int = 128,
    num_beams: int = 4,
    device=None,
) -> str:
    """Generate a short natural-language docstring for a Python code snippet.

    Args:
        code: Python source (function or snippet) to summarize.
        model_name: Hugging Face seq2seq checkpoint (T5/CodeT5 family).
        max_length: Maximum length of the generated summary, in tokens.
        num_beams: Beam width for beam-search decoding.
        device: "cuda" or "cpu"; auto-detected from torch when None.

    Returns:
        The decoded summary string with special tokens stripped.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load each checkpoint at most once per process; reloading on every call
    # dominated latency in the original implementation.
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSeq2SeqLM.from_pretrained(model_name),
        )
    tokenizer, model = _MODEL_CACHE[model_name]
    model = model.to(device)

    # T5-style models expect a task prefix; "summarize:" selects summarization.
    input_text = "summarize: " + code
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
        )

    return tokenizer.decode(out[0], skip_special_tokens=True)
37
+
38
+
39
def main():
    """CLI entry point: parse arguments, optionally read a .py file, print the result."""
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=str, required=True, help="Python code snippet (or path to file)")
    parser.add_argument("--model_name", type=str, default="t5-small")
    parser.add_argument("--max_length", type=int, default=128)
    parser.add_argument("--num_beams", type=int, default=4)
    args = parser.parse_args()

    code = args.input
    # Treat the input as a file path only when it actually names an existing
    # .py file; the previous `len(code) < 260` heuristic could misclassify
    # short inline snippets ending in ".py" and skip genuinely long paths.
    if code.endswith(".py") and os.path.isfile(code):
        try:
            with open(code, "r", encoding="utf-8") as f:
                code = f.read()
        except OSError:
            # Unreadable file: fall back to summarizing the raw argument,
            # matching the original best-effort behavior.
            pass

    docstring = generate_docstring(
        code,
        model_name=args.model_name,
        max_length=args.max_length,
        num_beams=args.num_beams,
    )
    print(docstring)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch>=2.0.0
2
+ transformers>=4.30.0
3
+ datasets>=2.12.0
4
+ gradio>=4.0.0