#!/usr/bin/env python3
"""
instruction_generation unified entry script
Supports --mode summarize|parse|all
"""

import os
import sys
import json
import time
import asyncio
import argparse
from pathlib import Path
from dotenv import load_dotenv

# Load .env file (before importing logger)
env_file = Path(__file__).parent / ".env"
if env_file.exists():
    load_dotenv(env_file)
elif (Path(__file__).parent.parent / ".env").exists():
    # If not in current directory, try loading from project root
    load_dotenv(Path(__file__).parent.parent / ".env")
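# Note: loading .env before importing util matters if the logger there reads
# its configuration from environment variables at import time.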

# Add current directory to path (for importing local modules)
sys.path.insert(0, str(Path(__file__).parent))
# Add domain_code/src to path for reusing util functions
sys.path.insert(0, str(Path(__file__).parent.parent / "domain_code" / "src"))
from util import init_logger, logger


# Pipeline stages: README summarization and function extraction
from summarize_repo_readme import process_all_repos as process_summarize
from extract_repo_functions import process_all_repos as process_extract


async def main():
    """Main function"""
    parser = argparse.ArgumentParser(
        description="instruction_generation unified entry tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Full pipeline: summarize README first, then parse functions
  python3 pipeline.py --mode all

  # Summarize README only
  python3 pipeline.py --mode summarize

  # Parse functions only (requires README_SUMMARY.md to exist)
  python3 pipeline.py --mode parse

  # Use local vLLM Qwen (default)
  python3 pipeline.py --mode all

  # Use OpenAI API
  export OPENAI_API_KEY="your-api-key"
  python3 pipeline.py --mode all --base_url https://api.openai.com/v1 --model gpt-4o-mini

  # Specify repository directory and other parameters
  python3 pipeline.py --mode all --repos_dir /path/to/repos_filtered --max_concurrency 16 --overwrite
        """,
    )
    
    # Common parameters
    parser.add_argument(
        "--repos_dir",
        type=str,
        default=os.getenv("REPOS_DIR", "/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered"),
        help="Repository root directory path (can be read from REPOS_DIR env var)",
    )
    parser.add_argument(
        "--mode",
        type=str,
        choices=["summarize", "parse", "all"],
        default="all",
        help="Execution mode: 'summarize' (README only), 'parse' (functions only), 'all' (full pipeline)",
    )
    
    # LLM parameters
    parser.add_argument(
        "--base_url",
        type=str,
        default=os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1"),
        help="LLM API base URL (can be read from OPENAI_BASE_URL env var, default: http://localhost:8000/v1)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=os.getenv("DEFAULT_MODEL", "Qwen3"),
        help="Model name (can be read from DEFAULT_MODEL env var, default: Qwen3)",
    )
    parser.add_argument(
        "--api_key_env",
        type=str,
        default="OPENAI_API_KEY",
        help="API key environment variable name (default: OPENAI_API_KEY)",
    )
    
    # Performance parameters
    parser.add_argument(
        "--max_concurrency",
        type=int,
        default=int(os.getenv("MAX_CONCURRENCY", "8")),
        help="Maximum concurrency (can be read from MAX_CONCURRENCY env var, default: 8)",
    )
    parser.add_argument(
        "--max_file_chars",
        type=int,
        default=int(os.getenv("MAX_FILE_CHARS", "200000")),
        help="Maximum file size (chars, for parse mode only, can be read from MAX_FILE_CHARS env var, default: 200000)",
    )
    
    # Other parameters
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing files",
    )
    parser.add_argument(
        "--log_file",
        type=str,
        default="instruction_generation/workdir/logs/pipeline.log",
        help="Log file path",
    )
    
    args = parser.parse_args()
    
    # Initialize logger
    init_logger(args.log_file, level="INFO")
    
    # Get API key (with debug logging)
    # region agent log
    debug_log_path = Path(__file__).parent.parent / ".cursor" / "debug.log"

    def _debug_log(location: str, message: str, data: dict) -> None:
        """Append a structured debug entry to .cursor/debug.log; never raise."""
        try:
            with open(debug_log_path, "a", encoding="utf-8") as f:
                log_entry = {
                    "sessionId": "debug-session",
                    "runId": "api-key-debug",
                    "hypothesisId": "A",
                    "location": location,
                    "message": message,
                    "data": data,
                    "timestamp": int(time.time() * 1000),
                }
                f.write(json.dumps(log_entry) + "\n")
        except Exception:
            pass

    env_key_before = os.getenv(args.api_key_env)
    _debug_log(
        location="pipeline.py:130",
        message="API key read from env",
        data={
            "env_var_name": args.api_key_env,
            "key_exists": env_key_before is not None,
            "key_length": len(env_key_before) if env_key_before else 0,
            "key_prefix": env_key_before[:20] + "..." if env_key_before and len(env_key_before) > 20 else env_key_before,
        },
    )
    # endregion

    # The "none" default is a placeholder for local OpenAI-compatible servers
    # (e.g. vLLM), which typically do not validate the key.
    api_key = os.getenv(args.api_key_env, "none")

    # region agent log
    _debug_log(
        location="pipeline.py:150",
        message="API key final value",
        data={
            "api_key_length": len(api_key) if api_key else 0,
            "api_key_prefix": api_key[:20] + "..." if api_key and len(api_key) > 20 else api_key,
            "api_key_suffix": "..." + api_key[-10:] if api_key and len(api_key) > 10 else api_key,
            "is_default_none": api_key == "none",
        },
    )
    # endregion
    
    # Check repository directory
    repos_dir = Path(args.repos_dir)
    if not repos_dir.exists():
        logger.error(f"Repository directory does not exist: {repos_dir}")
        sys.exit(1)
    
    # Create log directory
    log_file_path = Path(args.log_file)
    log_file_path.parent.mkdir(parents=True, exist_ok=True)
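    # (each stage below is passed this same log file path)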
    
    logger.info("=" * 80)
    logger.info(f"instruction_generation tool started")
    logger.info("=" * 80)
    logger.info(f"Mode: {args.mode}")
    logger.info(f"Repository directory: {repos_dir}")
    logger.info(f"LLM API: {args.base_url}")
    logger.info(f"Model: {args.model}")
    logger.info(f"Max concurrency: {args.max_concurrency}")
    logger.info(f"Overwrite existing files: {args.overwrite}")
    logger.info("=" * 80)
    
    # Execute based on mode
    if args.mode == "summarize":
        # Summarize README only
        logger.info("Starting: README summarization")
        results = await process_summarize(
            repos_dir=repos_dir,
            base_url=args.base_url,
            model=args.model,
            api_key=api_key,
            log_file=str(log_file_path),
            max_concurrency=args.max_concurrency,
            overwrite=args.overwrite,
        )
        
        logger.info("\n" + "=" * 80)
        logger.info("README summarization complete!")
        logger.info("=" * 80)
        
    elif args.mode == "parse":
        # Parse functions only
        logger.info("Starting: Function parsing")
        results = await process_extract(
            repos_dir=repos_dir,
            base_url=args.base_url,
            model=args.model,
            api_key=api_key,
            log_file=str(log_file_path),
            max_file_chars=args.max_file_chars,
            max_concurrency=args.max_concurrency,
            overwrite=args.overwrite,
        )
        
        logger.info("\n" + "=" * 80)
        logger.info("Function parsing complete!")
        logger.info("=" * 80)
        
    elif args.mode == "all":
        # Full pipeline: summarize README first, then parse functions
        logger.info("Starting: Full pipeline")
        logger.info("\n" + "-" * 80)
        logger.info("Step 1/2: README summarization")
        logger.info("-" * 80)
        
        summarize_results = await process_summarize(
            repos_dir=repos_dir,
            base_url=args.base_url,
            model=args.model,
            api_key=api_key,
            log_file=str(log_file_path),
            max_concurrency=args.max_concurrency,
            overwrite=args.overwrite,
        )
        
        logger.info("\n" + "-" * 80)
        logger.info("Step 2/2: Function parsing")
        logger.info("-" * 80)
        
        # Repos without README_SUMMARY.md are skipped during function parsing;
        # --overwrite re-runs parsing for repos that already have results, but
        # repos lacking a summary are still skipped.
        parse_results = await process_extract(
            repos_dir=repos_dir,
            base_url=args.base_url,
            model=args.model,
            api_key=api_key,
            log_file=str(log_file_path),
            max_file_chars=args.max_file_chars,
            max_concurrency=args.max_concurrency,
            overwrite=args.overwrite,
        )
        
        logger.info("\n" + "=" * 80)
        logger.info("Full pipeline complete!")
        logger.info("=" * 80)
        
        results = parse_results
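        # NOTE: per-stage results are collected but not used further here.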
    else:
        logger.error(f"Unknown mode: {args.mode}")
        sys.exit(1)
    
    logger.info("\n" + "=" * 80)
    logger.info("All tasks complete!")
    logger.info("=" * 80)


if __name__ == "__main__":
    asyncio.run(main())