DouDou committed on
Commit
c21b56f
·
verified ·
1 Parent(s): 432dc67

Upload data2/instruction_generation/pipeline.py with huggingface_hub

Browse files
data2/instruction_generation/pipeline.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
instruction_generation unified entry script
Supports --mode summarize|parse|all
"""

import os
import sys
import asyncio
import argparse
from pathlib import Path
from dotenv import load_dotenv

# Directory containing this script, and the project root one level up.
# (Named once so the paths below are not recomputed from __file__ repeatedly.)
_SCRIPT_DIR = Path(__file__).parent
_PROJECT_ROOT = _SCRIPT_DIR.parent

# Load .env file (before importing logger).
# Prefer the script-local .env; fall back to the project root. Only the
# first existing file is loaded, matching the original if/elif behavior.
for _env_candidate in (_SCRIPT_DIR / ".env", _PROJECT_ROOT / ".env"):
    if _env_candidate.exists():
        load_dotenv(_env_candidate)
        break

# Add current directory to path (for importing local modules)
sys.path.insert(0, str(_SCRIPT_DIR))
# Add domain_code/src to path for reusing util functions
sys.path.insert(0, str(_PROJECT_ROOT / "domain_code" / "src"))
from util import init_logger, logger


# Pipeline stage entry points (importable only after the sys.path tweaks above)
from summarize_repo_readme import process_all_repos as process_summarize
from extract_repo_functions import process_all_repos as process_extract
34
def _write_debug_log(debug_log_path: Path, location: str, message: str, data: dict) -> None:
    """Append one JSON debug record to *debug_log_path*, best-effort.

    Any failure (missing .cursor directory, permissions, ...) is swallowed
    deliberately: debug tracing must never break the pipeline.
    """
    import json
    import time

    try:
        log_entry = {
            "sessionId": "debug-session",
            "runId": "api-key-debug",
            "hypothesisId": "A",
            "location": location,
            "message": message,
            "data": data,
            "timestamp": int(time.time() * 1000),
        }
        with open(debug_log_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(log_entry) + "\n")
    except Exception:
        pass


async def main():
    """Parse CLI arguments and run the requested pipeline stage(s).

    Modes:
        summarize -- README summarization only
        parse     -- function extraction only (needs README_SUMMARY.md)
        all       -- summarize, then parse

    Exits with status 1 when the repository directory does not exist.
    """
    parser = argparse.ArgumentParser(
        description="instruction_generation unified entry tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Full pipeline: summarize README first, then parse functions
  python3 pipeline.py --mode all

  # Summarize README only
  python3 pipeline.py --mode summarize

  # Parse functions only (requires README_SUMMARY.md to exist)
  python3 pipeline.py --mode parse

  # Use local vLLM Qwen (default)
  python3 pipeline.py --mode all

  # Use OpenAI API
  export OPENAI_API_KEY="your-api-key"
  python3 pipeline.py --mode all --base_url https://api.openai.com/v1 --model gpt-4o-mini

  # Specify repository directory and other parameters
  python3 pipeline.py --mode all --repos_dir /path/to/repos_filtered --max_concurrency 16 --overwrite
""",
    )

    # Common parameters
    parser.add_argument(
        "--repos_dir",
        type=str,
        default=os.getenv("REPOS_DIR", "/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered"),
        help="Repository root directory path (can be read from REPOS_DIR env var)",
    )
    parser.add_argument(
        "--mode",
        type=str,
        choices=["summarize", "parse", "all"],
        default="all",
        help="Execution mode: 'summarize' (README only), 'parse' (functions only), 'all' (full pipeline)",
    )

    # LLM parameters
    parser.add_argument(
        "--base_url",
        type=str,
        default=os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1"),
        help="LLM API base URL (can be read from OPENAI_BASE_URL env var, default: http://localhost:8000/v1)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=os.getenv("DEFAULT_MODEL", "Qwen3"),
        help="Model name (can be read from DEFAULT_MODEL env var, default: Qwen3)",
    )
    parser.add_argument(
        "--api_key_env",
        type=str,
        default="OPENAI_API_KEY",
        help="API key environment variable name (default: OPENAI_API_KEY)",
    )

    # Performance parameters
    parser.add_argument(
        "--max_concurrency",
        type=int,
        default=int(os.getenv("MAX_CONCURRENCY", "8")),
        help="Maximum concurrency (can be read from MAX_CONCURRENCY env var, default: 8)",
    )
    parser.add_argument(
        "--max_file_chars",
        type=int,
        default=int(os.getenv("MAX_FILE_CHARS", "200000")),
        help="Maximum file size (chars, for parse mode only, can be read from MAX_FILE_CHARS env var, default: 200000)",
    )

    # Other parameters
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing files",
    )
    parser.add_argument(
        "--log_file",
        type=str,
        default="instruction_generation/workdir/logs/pipeline.log",
        help="Log file path",
    )

    args = parser.parse_args()

    # Create the log directory BEFORE initializing the logger. The original
    # only created it further down, so init_logger could fail to open the
    # log file on a fresh checkout.
    log_file_path = Path(args.log_file)
    log_file_path.parent.mkdir(parents=True, exist_ok=True)

    # Initialize logger
    init_logger(args.log_file, level="INFO")

    # API-key debug tracing. SECURITY: only existence and length are logged.
    # The original wrote key prefixes/suffixes to disk, which leaks secret
    # material — never log actual key content.
    debug_log_path = Path(__file__).parent.parent / ".cursor" / "debug.log"
    env_key_before = os.getenv(args.api_key_env)
    _write_debug_log(
        debug_log_path,
        "pipeline.py:130",
        "API key read from env",
        {
            "env_var_name": args.api_key_env,
            "key_exists": env_key_before is not None,
            "key_length": len(env_key_before) if env_key_before else 0,
        },
    )

    # Fall back to the literal "none" so servers that ignore auth (e.g. a
    # local vLLM endpoint) still receive a non-empty key string.
    api_key = os.getenv(args.api_key_env, "none")
    _write_debug_log(
        debug_log_path,
        "pipeline.py:150",
        "API key final value",
        {
            "api_key_length": len(api_key) if api_key else 0,
            "is_default_none": api_key == "none",
        },
    )

    # Check repository directory
    repos_dir = Path(args.repos_dir)
    if not repos_dir.exists():
        logger.error(f"Repository directory does not exist: {repos_dir}")
        sys.exit(1)

    logger.info("=" * 80)
    logger.info("instruction_generation tool started")
    logger.info("=" * 80)
    logger.info(f"Mode: {args.mode}")
    logger.info(f"Repository directory: {repos_dir}")
    logger.info(f"LLM API: {args.base_url}")
    logger.info(f"Model: {args.model}")
    logger.info(f"Max concurrency: {args.max_concurrency}")
    logger.info(f"Overwrite existing files: {args.overwrite}")
    logger.info("=" * 80)

    # Keyword arguments shared by both pipeline stages (deduplicated from
    # the three near-identical call sites in the original).
    stage_kwargs = dict(
        repos_dir=repos_dir,
        base_url=args.base_url,
        model=args.model,
        api_key=api_key,
        log_file=str(log_file_path),
        max_concurrency=args.max_concurrency,
        overwrite=args.overwrite,
    )

    # Execute based on mode
    if args.mode == "summarize":
        # Summarize README only
        logger.info("Starting: README summarization")
        await process_summarize(**stage_kwargs)

        logger.info("\n" + "=" * 80)
        logger.info("README summarization complete!")
        logger.info("=" * 80)

    elif args.mode == "parse":
        # Parse functions only
        logger.info("Starting: Function parsing")
        await process_extract(max_file_chars=args.max_file_chars, **stage_kwargs)

        logger.info("\n" + "=" * 80)
        logger.info("Function parsing complete!")
        logger.info("=" * 80)

    elif args.mode == "all":
        # Full pipeline: summarize README first, then parse functions
        logger.info("Starting: Full pipeline")
        logger.info("\n" + "-" * 80)
        logger.info("Step 1/2: README summarization")
        logger.info("-" * 80)

        await process_summarize(**stage_kwargs)

        logger.info("\n" + "-" * 80)
        logger.info("Step 2/2: Function parsing")
        logger.info("-" * 80)

        # Repos lacking README_SUMMARY.md are skipped by process_extract
        # itself, even when --overwrite is requested.
        await process_extract(max_file_chars=args.max_file_chars, **stage_kwargs)

        logger.info("\n" + "=" * 80)
        logger.info("Full pipeline complete!")
        logger.info("=" * 80)

    else:
        # Unreachable in practice: argparse `choices` already rejects other
        # values. Kept as a defensive guard.
        logger.error(f"Unknown mode: {args.mode}")
        sys.exit(1)

    logger.info("\n" + "=" * 80)
    logger.info("All tasks complete!")
    logger.info("=" * 80)
282
+
283
+
284
if __name__ == "__main__":
    # Script entry point: drive the async pipeline to completion.
    asyncio.run(main())