wuhp committed on
Commit
a623051
Β·
verified Β·
1 Parent(s): f6f8a40

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +1036 -0
app.py ADDED
@@ -0,0 +1,1036 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RepoForge – Agentic Repo Conversion App
3
+ ========================================
4
+ • Multi-key Gemini rotation (auto-switches when quota hits)
5
+ • RAG over source repo (chunked, on-demand, never fully loaded)
6
+ • Creates a HF Space (Docker preset), generates Dockerfile + vite.config.js
7
+ • Surgical patch loop: read build logs → apply unified diffs → never full rewrites
8
+ • Streams progress back to Gradio UI in real-time
9
+ """
10
+
11
+ import gradio as gr
12
+ import google.generativeai as genai
13
+ from huggingface_hub import HfApi, SpaceHardware
14
+ import os, re, json, time, math, hashlib, tempfile, subprocess, textwrap
15
+ from pathlib import Path
16
+ from dataclasses import dataclass, field
17
+ from typing import Generator, Optional
18
+ import threading
19
+
20
+ # ─── Gemini Key Rotator ───────────────────────────────────────────────────────
21
+
22
class GeminiRotator:
    """Round-robin across multiple Gemini API keys; swaps on quota errors.

    ``generate`` tries the current key and, when the SDK raises an error whose
    message looks quota/rate-limit related, rotates to the next key and retries.
    Any other error is re-raised immediately.
    """

    # Lower-case substrings that mark an error message as quota related.
    # Rate-limit variants are spelled out: a bare "rate" would also match
    # unrelated messages such as "failed to generate content".
    QUOTA_ERRORS = (
        "429",
        "quota",
        "rate limit",
        "rate_limit",
        "ratelimit",
        "rate exceeded",
        "too many requests",
        "exhausted",
        "resource_exhausted",
    )

    def __init__(self, keys: list[str]):
        """Store the non-blank keys (whitespace-stripped).

        Raises:
            ValueError: if no usable key remains after stripping.
        """
        self.keys = [k.strip() for k in keys if k.strip()]
        self.idx = 0
        self.lock = threading.Lock()
        if not self.keys:
            raise ValueError("At least one Gemini API key is required.")

    @classmethod
    def _is_quota_error(cls, message: str) -> bool:
        """Return True if *message* contains any of the QUOTA_ERRORS markers."""
        lowered = message.lower()
        return any(marker in lowered for marker in cls.QUOTA_ERRORS)

    def current_key(self) -> str:
        """Return the key currently selected by the rotation index."""
        with self.lock:
            return self.keys[self.idx % len(self.keys)]

    def rotate(self):
        """Advance the rotation index to the next key (wrapping around)."""
        with self.lock:
            self.idx = (self.idx + 1) % len(self.keys)

    def generate(self, system: str, prompt: str, max_tokens: int = 8192) -> str:
        """Try each key once; raise if all exhausted.

        Args:
            system: system instruction passed to the model.
            prompt: user prompt.
            max_tokens: cap on output tokens for this call.

        Returns:
            The response text.

        Raises:
            RuntimeError: every key failed with a quota-style error (the last
                such error is attached as ``__cause__``).
            Exception: any non-quota error from the SDK is propagated as-is.
        """
        last_quota_error = None
        for _ in range(len(self.keys)):
            key = self.current_key()
            try:
                # NOTE(review): genai.configure appears to set process-wide
                # state, so concurrent generate() calls may race on the key.
                genai.configure(api_key=key)
                model = genai.GenerativeModel(
                    model_name="gemini-1.5-pro",
                    system_instruction=system,
                )
                resp = model.generate_content(
                    prompt,
                    generation_config=genai.types.GenerationConfig(
                        max_output_tokens=max_tokens,
                        temperature=0.2,
                    ),
                )
                return resp.text
            except Exception as e:
                if self._is_quota_error(str(e)):
                    last_quota_error = e
                    self.rotate()
                    continue
                raise
        raise RuntimeError(
            "All Gemini API keys exhausted or errored."
        ) from last_quota_error
66
+
67
+
68
+ # ─── Repo RAG Index ───────────────────────────────────────────────────────────
69
+
70
@dataclass
class FileChunk:
    """A contiguous range of lines taken from one file of the indexed repo."""

    path: str        # file path relative to the repo root
    start_line: int  # 1-indexed first line of the chunk
    end_line: int    # 1-indexed last line of the chunk (inclusive)
    content: str     # chunk text, lines joined with "\n"
    tokens: int      # rough estimate (len(content) // 4)

    def header(self) -> str:
        """Return the markdown heading that labels this chunk in prompts."""
        return f"### {self.path} (lines {self.start_line}-{self.end_line})"


class RepoRAG:
    """
    Lightweight RAG over a local repo directory.
    Files are chunked with configurable size (300-1200 lines) and a 30-line overlap
    so context is never lost at chunk boundaries.
    On query we return the top-k most relevant chunks (never the whole repo).
    """

    MIN_CHUNK_LINES = 300
    MAX_CHUNK_LINES = 1200
    OVERLAP_LINES = 30  # lines shared between adjacent chunks
    IGNORE_DIRS = {".git", "node_modules", "__pycache__", ".venv", "dist", "build"}
    TEXT_EXTS = {
        ".py", ".js", ".ts", ".jsx", ".tsx", ".rs", ".go", ".java", ".c", ".cpp",
        ".h", ".hpp", ".cs", ".rb", ".php", ".swift", ".kt", ".toml", ".yaml",
        ".yml", ".json", ".md", ".txt", ".sh", ".bash", ".dockerfile", ".env",
        ".html", ".css", ".scss", ".sql", ".graphql",
    }
    # Identifier-like words of 3+ characters; used for indexing and querying.
    _WORD_RE = re.compile(r"[a-zA-Z_]\w{2,}")

    def __init__(self, repo_path: str, chunk_lines: int = 600):
        """Index *repo_path*; *chunk_lines* is clamped to the MIN/MAX bounds."""
        self.repo_path = Path(repo_path)
        self.chunk_lines = max(self.MIN_CHUNK_LINES, min(self.MAX_CHUNK_LINES, chunk_lines))
        self.chunks: list[FileChunk] = []
        self.index: dict[str, list[int]] = {}  # word -> chunk indices (one per occurrence)
        self._build()

    def _build(self):
        """Walk the repo, split text files into overlapping chunks, index words."""
        step = self.chunk_lines - self.OVERLAP_LINES  # stride with overlap
        # sorted() makes chunk order (and so tie-breaking/padding) deterministic.
        for fpath in sorted(self.repo_path.rglob("*")):
            if not fpath.is_file():
                continue
            rel = fpath.relative_to(self.repo_path)
            # Test only the path *inside* the repo: checking absolute parts
            # would drop every file when the repo itself sits under a directory
            # called e.g. "build" or "dist".
            if any(part in self.IGNORE_DIRS for part in rel.parts):
                continue
            if fpath.suffix.lower() not in self.TEXT_EXTS:
                continue
            try:
                lines = fpath.read_text(encoding="utf-8", errors="replace").splitlines()
            except OSError:
                continue
            # max(..., 1) keeps empty files visible as a single empty chunk.
            total = max(len(lines), 1)
            for i in range(0, total, step):
                # The previous chunk already reached line i + OVERLAP_LINES;
                # skip a tail chunk that would add no new lines.
                if i > 0 and i + self.OVERLAP_LINES >= total:
                    break
                piece = lines[i : i + self.chunk_lines]
                content = "\n".join(piece)
                self.chunks.append(FileChunk(
                    path=str(rel),
                    start_line=i + 1,
                    end_line=i + len(piece),
                    content=content,
                    tokens=len(content) // 4,  # rough: ~4 chars per token
                ))
                chunk_idx = len(self.chunks) - 1
                # One posting per occurrence, so query scores weight frequency.
                for word in self._WORD_RE.findall(content.lower()):
                    self.index.setdefault(word, []).append(chunk_idx)

    def query(self, query: str, top_k: int = 6, budget_tokens: int = 12000) -> str:
        """Return top_k relevant chunks as a formatted string, within token budget.

        Chunks are scored by how often the query's words occur in them. When
        fewer than *top_k* chunks match, the list is padded with the earliest
        indexed chunks. A chunk that does not fit the remaining budget is
        skipped (smaller, lower-ranked chunks may still be included).
        """
        scores: dict[int, int] = {}
        for w in self._WORD_RE.findall(query.lower()):
            for idx in self.index.get(w, []):
                scores[idx] = scores.get(idx, 0) + 1
        ranked = sorted(scores, key=lambda i: -scores[i])[:top_k]

        # Pad with first chunks if too few matched.
        chosen = set(ranked)
        for i in range(len(self.chunks)):
            if len(ranked) >= top_k:
                break
            if i not in chosen:
                ranked.append(i)
                chosen.add(i)

        out, used = [], 0
        for idx in ranked:
            c = self.chunks[idx]
            if used + c.tokens > budget_tokens:
                continue  # too big for what is left; try the next candidate
            out.append(f"{c.header()}\n```\n{c.content}\n```")
            used += c.tokens
        return "\n\n".join(out) if out else "(no relevant source chunks found)"

    def file_tree(self, max_lines: int = 200) -> str:
        """Return the sorted indexed file paths, one per line, capped at max_lines."""
        paths = sorted(set(c.path for c in self.chunks))
        lines = paths[:max_lines]
        if len(paths) > max_lines:
            lines.append(f"... and {len(paths) - max_lines} more files")
        return "\n".join(lines)

    def get_file(self, rel_path: str) -> Optional[str]:
        """Return full content of a specific file, or None if unavailable.

        Paths that resolve outside the repo root (e.g. via "..") are refused.
        """
        root = self.repo_path.resolve()
        full = (root / rel_path).resolve()
        if full != root and root not in full.parents:
            return None
        if not full.is_file():
            return None
        try:
            return full.read_text(encoding="utf-8", errors="replace")
        except OSError:
            return None
179
+
180
+
181
+ # ─── HuggingFace Space Manager ────────────────────────────────────────────────
182
+
183
class SpaceManager:
    """Thin wrapper around ``HfApi`` for one HuggingFace Space.

    ``create_space`` must be called before any upload; it records the Space's
    repo id in ``space_id``.
    """

    def __init__(self, hf_token: str):
        self.api = HfApi(token=hf_token)
        self.token = hf_token
        self.space_id: Optional[str] = None  # set by create_space()

    def create_space(self, namespace: str, name: str) -> str:
        """Create a Docker-preset HF Space. Returns repo_id."""
        repo_id = f"{namespace}/{name}"
        self.api.create_repo(
            repo_id=repo_id,
            repo_type="space",
            space_sdk="docker",
            exist_ok=True,
            private=False,
        )
        self.space_id = repo_id
        return repo_id

    def upload_file(self, local_path: str, repo_path: str):
        """Upload the local file at *local_path* to *repo_path* in the Space.

        Raises:
            RuntimeError: if no Space has been created yet.
        """
        if not self.space_id:
            raise RuntimeError("No space created yet.")
        self.api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=repo_path,
            repo_id=self.space_id,
            repo_type="space",
        )

    def upload_content(self, content: str, repo_path: str):
        """Upload string content directly.

        The text is written as UTF-8 to a temporary file, which is removed
        even when the upload fails.
        """
        with tempfile.NamedTemporaryFile(
            mode="w",
            suffix=Path(repo_path).suffix,
            delete=False,
            encoding="utf-8",  # do not depend on the locale's default encoding
        ) as f:
            f.write(content)
            tmp = f.name
        try:
            self.upload_file(tmp, repo_path)
        finally:
            os.unlink(tmp)

    def get_logs(self, lines: int = 80) -> str:
        """Fetch build logs from the Space.

        Returns "" when no Space exists yet, and a parenthesised error note
        (never an exception) when the logs cannot be fetched.
        """
        if not self.space_id:
            return ""
        try:
            # Called for its side effect only: it raises if the runtime
            # cannot be queried, which lands in the handler below.
            self.api.get_space_runtime(self.space_id)
            # NOTE(review): get_space_logs is not an HfApi method I can
            # confirm; if it is missing, the AttributeError is reported via
            # the handler below. Entries are assumed to be dict-like.
            log_text = [
                entry.get("text", "")
                for entry in self.api.get_space_logs(self.space_id, lines=lines)
            ]
            return "\n".join(log_text)
        except Exception as e:
            return f"(could not fetch logs: {e})"

    def space_url(self) -> str:
        """Return the Space's web URL, or "" if no Space exists yet."""
        if not self.space_id:
            return ""
        return f"https://huggingface.co/spaces/{self.space_id}"
240
+
241
+
242
+ # ─── Patch Utilities ──────────────────────────────────────────────────────────
243
+
244
def apply_unified_diff(original: str, patch: str) -> str:
    """Apply a unified diff string to original content.

    The diff is applied with the external ``patch`` tool inside a temporary
    directory.

    Args:
        original: current file content.
        patch: unified diff text targeting that content.

    Returns:
        The patched content, or *original* unchanged when the diff is empty,
        does not apply, or the ``patch`` executable cannot be run. Callers
        detect failure by comparing the result with *original*.
    """
    if not patch.strip():
        return original
    # LLM-extracted diffs often lack the final newline that patch expects.
    if not patch.endswith("\n"):
        patch += "\n"

    with tempfile.TemporaryDirectory() as td:
        orig_file = Path(td) / "original"
        patch_file = Path(td) / "patch.diff"
        orig_file.write_text(original, encoding="utf-8")
        patch_file.write_text(patch, encoding="utf-8")
        try:
            result = subprocess.run(
                ["patch", "-u", str(orig_file), str(patch_file)],
                capture_output=True, text=True
            )
        except OSError:
            # `patch` is not installed or not executable: treat as "not applied".
            return original
        if result.returncode == 0:
            return orig_file.read_text(encoding="utf-8")
    return original  # patch failed, return unchanged
258
+
259
+
260
def extract_files_from_response(text: str) -> dict[str, str]:
    """
    Parse LLM response for fenced code blocks with filenames.

    Two layouts are recognised:
      1. the path on the fence's opening line:  ```lang path/to/file
      2. a heading followed by a plain fence:   ### FILE: path/to/file

    Returns a mapping of path -> block content. When a path appears more than
    once the later block wins, and layout 2 overrides layout 1.
    """
    files: dict[str, str] = {}

    # Layout 1: ```lang path/to/file\n...content...\n```
    # The language and the path must share the opening line ([ \t]+, not \s+),
    # otherwise a block such as "```yaml\n---\n..." would be read as a file
    # named "---". The fence must also start its line.
    fenced_with_path = re.compile(
        r"^[ \t]*```[\w]*[ \t]+([\w./\-]+)[ \t]*\n(.*?)```",
        re.DOTALL | re.MULTILINE,
    )
    for m in fenced_with_path.finditer(text):
        files[m.group(1).strip()] = m.group(2)

    # Layout 2: ### FILE: path\n```\ncontent\n```
    heading_then_fence = re.compile(
        r"###\s+(?:FILE|file):\s*([\w./\-]+)\s*\n```[\w]*\n(.*?)```",
        re.DOTALL,
    )
    for m in heading_then_fence.finditer(text):
        files[m.group(1).strip()] = m.group(2)

    return files
283
+
284
+
285
def extract_patches(text: str) -> dict[str, str]:
    """Collect the unified diffs announced by ``### PATCH: <path>`` headings.

    Each heading must be followed directly by a ```diff fenced block. The
    result maps the path to the raw diff text; a later patch for the same
    path replaces an earlier one.
    """
    patch_block = re.compile(
        r"###\s+PATCH:\s*([\w./\-]+)\s*\n```diff\n(.*?)```",
        re.DOTALL,
    )
    return {
        found.group(1).strip(): found.group(2)
        for found in patch_block.finditer(text)
    }
295
+
296
+
297
+ # ─── File Continuation System ─────────────────────────────────────────────────
298
+
299
+ # Sentinel the model outputs when it runs out of tokens mid-file
300
+ INCOMPLETE_SENTINEL = "##INCOMPLETE##"
301
+
302
@dataclass
class IncompleteFile:
    """Bookkeeping record for a file whose generation was cut off.

    Attributes:
        path: path of the unfinished file.
        content_so_far: all text produced before the cutoff.
        last_line: number of lines written so far (1-indexed last line).
        resume_hint: the trailing lines (about 20) shown to the model as
            context when asking it to continue.
    """

    path: str
    content_so_far: str
    last_line: int
    resume_hint: str
309
+
310
+
311
def detect_incomplete(files: dict[str, str]) -> list[IncompleteFile]:
    """
    Find parsed files that contain the INCOMPLETE sentinel.

    For each such file, everything from the sentinel onward is dropped and
    trailing whitespace is stripped. The truncated text is written back into
    ``files`` (the dict is modified in place), and an IncompleteFile record
    describing where to resume is returned for it.
    """
    pending = []
    for file_path in list(files):
        head, marker, _ = files[file_path].partition(INCOMPLETE_SENTINEL)
        if not marker:
            continue  # sentinel absent: file is complete
        truncated = head.rstrip()
        written = truncated.splitlines()
        # Short files are passed whole as the resume hint.
        if len(written) >= 20:
            hint = "\n".join(written[-20:])
        else:
            hint = truncated
        pending.append(
            IncompleteFile(
                path=file_path,
                content_so_far=truncated,
                last_line=len(written),
                resume_hint=hint,
            )
        )
        files[file_path] = truncated  # keep what we have so far
    return pending
331
+
332
+
333
def extract_continuation(text: str, inc: "IncompleteFile") -> Optional[str]:
    """
    Locate the continuation text for ``inc.path`` inside a CONTINUE response.

    The preferred layout is a heading followed by a fenced block:
        ### CONTINUE: path/to/file
        ```lang
        <rest of file from where it left off>
        ```
    If that is absent, a fenced block whose opening line names the path is
    accepted instead. Returns the block's content, or None when neither
    layout is present.
    """
    escaped_path = re.escape(inc.path)
    layouts = (
        # Heading form.
        rf"###\s+CONTINUE:\s*{escaped_path}\s*\n```[\w]*\n(.*?)```",
        # Fallback: bare fenced block carrying the path.
        rf"```[\w]*\s+{escaped_path}\n(.*?)```",
    )
    for layout in layouts:
        found = re.search(layout, text, re.DOTALL)
        if found is not None:
            return found.group(1)
    return None
355
+
356
+
357
+ # ─── Prompts ──────────────────────────────────────────────────────────────────
358
+
359
+ SYSTEM_ARCHITECT = """You are RepoForge, an expert software architect and full-stack engineer.
360
+ You help users convert, rebuild, or transform code repositories by:
361
+ 1. Analyzing the source repo structure (provided via RAG chunks)
362
+ 2. Creating a HuggingFace Space with Docker preset
363
+ 3. Generating a production-grade Dockerfile and vite.config.js
364
+ 4. Iteratively patching code based on build logs β€” NEVER fully regenerating files after initial creation
365
+ 5. Making SURGICAL unified-diff patches to fix errors
366
+
367
+ FILE FORMAT RULES (strictly follow):
368
+ - New files: fenced block with path on the opening line:
369
+ ```typescript src/main.ts
370
+ <content>
371
+ ```
372
+ - Patches: ### PATCH: src/main.ts then ```diff with unified diff syntax
373
+ - Always prefer patches over full file rewrites after initial generation
374
+ - Keep responses focused; do NOT repeat unchanged file contents
375
+ - Files may be up to 1200 lines. Split larger logic into sub-modules.
376
+
377
+ CONTINUATION PROTOCOL (critical):
378
+ - If you cannot finish a file in one response, write as much as you can, then end the
379
+ fenced block with the literal token ##INCOMPLETE## on its own line before the closing ```.
380
+ - Example:
381
+ ```rust src/handlers.rs
382
+ // ... all content you managed to write ...
383
+ ##INCOMPLETE##
384
+ ```
385
+ - In the very next response (a CONTINUE prompt), resume from exactly where you left off.
386
+ - For continuations output:
387
+ ### CONTINUE: src/handlers.rs
388
+ ```rust
389
+ <rest of file, picking up at the exact next line>
390
+ ```
391
+ - Never re-emit lines already written. Overlap by at most 2 lines for context.
392
+ - A file is only considered complete when its fenced block closes WITHOUT ##INCOMPLETE##.
393
+
394
+ MEMORY RULES:
395
+ - Source repo context is provided as RAG chunks (not the full repo)
396
+ - Ask for more context by outputting NEED_CONTEXT: <query> on its own line
397
+ - Be precise about line numbers in diffs
398
+ """
399
+
400
+ PROMPT_INITIAL = """
401
+ ## Goal
402
+ {goal}
403
+
404
+ ## Source Repo File Tree
405
+ {file_tree}
406
+
407
+ ## Relevant Source Chunks (RAG)
408
+ {rag_chunks}
409
+
410
+ ## CodeConvert Agent Instructions (from HF Space)
411
+ {agents_md}
412
+
413
+ ## Task
414
+ 1. Generate a `Dockerfile` (Docker preset for HF Spaces β€” port 7860, non-root user)
415
+ 2. Generate a `vite.config.js` (or `vite.config.ts`) appropriate for this project
416
+ 3. Generate a `README.md` for the HF Space (title, description, sdk: docker, app_port: 7860)
417
+ 4. Begin converting/scaffolding the main entrypoint file(s) toward the goal
418
+ 5. Files may be up to 1200 lines; split larger logic into sub-modules
419
+ 6. If you run out of space mid-file, end that block with ##INCOMPLETE## β€” do NOT truncate silently
420
+
421
+ Output each file using the fenced block format. Use ##INCOMPLETE## if needed.
422
+ """
423
+
424
+ PROMPT_CONTINUE = """
425
+ ## Continuation Request
426
+
427
+ You previously started writing `{path}` but ran out of space at line {last_line}.
428
+
429
+ ## Last lines written (for context):
430
+ ```
431
+ {resume_hint}
432
+ ```
433
+
434
+ ## Goal (keep in mind)
435
+ {goal}
436
+
437
+ ## Task
438
+ Continue writing `{path}` from exactly where you left off (line {next_line}).
439
+ Output ONLY the continuation using:
440
+
441
+ ### CONTINUE: {path}
442
+ ```<lang>
443
+ <rest of file from line {next_line} onward>
444
+ ```
445
+
446
+ If you STILL cannot finish in this response, end again with ##INCOMPLETE##.
447
+ Do NOT re-emit lines already written (overlap max 2 lines for context is fine).
448
+ """
449
+
450
+ PROMPT_PATCH = """
451
+ ## Current Goal
452
+ {goal}
453
+
454
+ ## Build Logs (last 80 lines)
455
+ ```
456
+ {logs}
457
+ ```
458
+
459
+ ## Files Currently in Space
460
+ {file_list}
461
+
462
+ ## Relevant Source Chunks (RAG for context)
463
+ {rag_chunks}
464
+
465
+ ## Task
466
+ Analyze the build logs. Identify errors. Output ONLY surgical patches (unified diff) to fix them.
467
+ - Use ### PATCH: <filepath> then ```diff blocks
468
+ - Do NOT rewrite whole files
469
+ - If you need more source context, output NEED_CONTEXT: <specific query> on its own line
470
+ - If the build succeeded and goal is met, output: GOAL_COMPLETE
471
+ - If the build succeeded but goal isn't fully met, output next round of patches
472
+ - If a new file is needed (not previously created), use the normal fenced block format
473
+ - Use ##INCOMPLETE## if a new file can't fit in one response
474
+ """
475
+
476
+
477
+ # ─── Main Agent Loop ──────────────────────────────────────────────────────────
478
+
479
+ AGENTS_MD_FALLBACK = """
480
+ CodeConvert is an AI agent that converts codebases between programming languages.
481
+ It uses a RAG approach to reference the original repo, generates idiomatic target-language
482
+ code, and iteratively patches until the build passes. It focuses on:
483
+ - Preserving logic and architecture
484
+ - Generating idiomatic target-language code
485
+ - Surgical patches based on compiler/runtime errors
486
+ - Modular file structure
487
+ """
488
+
489
# Upper bound on CONTINUE round-trips for one file, so a model that keeps
# emitting the sentinel cannot cause an endless stream of API calls.
_MAX_CONTINUATIONS_PER_FILE = 10


def _finish_incomplete_files(rotator, goal, files, pending, indent="  "):
    """Ask the model to finish every file in *pending*; yield log lines.

    *files* (path -> content) is updated in place with whatever content has
    been assembled. A file whose continuation fails, cannot be parsed, or hits
    the round limit keeps its partial content.
    """
    from collections import deque

    queue = deque((inc, 1) for inc in pending)
    while queue:
        inc, round_no = queue.popleft()
        yield f"{indent}↩️ Continuing {inc.path} from line {inc.last_line}...\n"
        cont_prompt = PROMPT_CONTINUE.format(
            path=inc.path,
            last_line=inc.last_line,
            next_line=inc.last_line + 1,
            resume_hint=inc.resume_hint,
            goal=goal,
        )
        try:
            cont_response = rotator.generate(SYSTEM_ARCHITECT, cont_prompt, max_tokens=8192)
        except Exception as e:
            yield f"{indent}❌ Continuation error for {inc.path}: {e}\n"
            continue

        continuation = extract_continuation(cont_response, inc)
        if not continuation:
            yield f"{indent}⚠️ Could not parse continuation for {inc.path} — using partial content\n"
            continue

        if INCOMPLETE_SENTINEL not in continuation:
            files[inc.path] = inc.content_so_far + "\n" + continuation
            yield f"{indent}✅ {inc.path} completed ({len(files[inc.path].splitlines())} lines total)\n"
            continue

        # The continuation is itself truncated: keep the part before the
        # sentinel and queue another round.
        clean_cont = continuation[: continuation.index(INCOMPLETE_SENTINEL)].rstrip()
        appended = inc.content_so_far + "\n" + clean_cont
        appended_lines = appended.splitlines()
        files[inc.path] = appended
        if round_no >= _MAX_CONTINUATIONS_PER_FILE:
            yield (
                f"{indent}⚠️ {inc.path} still incomplete after {round_no} continuations"
                " — using partial content\n"
            )
            continue
        queue.append((
            IncompleteFile(
                path=inc.path,
                content_so_far=appended,
                last_line=len(appended_lines),
                resume_hint="\n".join(appended_lines[-20:]),
            ),
            round_no + 1,
        ))
        yield (
            f"{indent}↩️ {inc.path} still incomplete at line {len(appended_lines)}"
            " — queuing another continuation\n"
        )


def run_agent(
    hf_token: str,
    gemini_keys_raw: str,
    goal: str,
    source_input: str,  # github url, hf space url, or local folder path
    space_name: str,
    max_iterations: int = 8,
    chunk_lines: int = 600,
) -> Generator[str, None, None]:
    """Main agentic loop. Yields log strings for Gradio streaming.

    Steps: validate credentials, obtain the source repo (git clone or local
    folder), build the RAG index, create the HF Space, generate and upload the
    initial files, then run up to *max_iterations* rounds of "read build logs
    -> ask for patches -> apply and upload".

    Args:
        hf_token: HuggingFace token used to create and populate the Space.
        gemini_keys_raw: one or more Gemini API keys, comma/newline separated.
        goal: natural-language description of the desired conversion.
        source_input: GitHub URL, HF Space URL, or local directory path.
        space_name: requested Space name (sanitised before use).
        max_iterations: maximum number of patch rounds.
        chunk_lines: RAG chunk size in lines (clamped by RepoRAG).

    Yields:
        Newline-terminated progress messages. Errors are reported as messages
        and end the generator; they are not raised.
    """

    def log(msg: str):
        return msg + "\n"

    yield log("🔧 Initializing RepoForge...")

    # UI text fields commonly carry stray whitespace.
    hf_token = (hf_token or "").strip()
    source_input = (source_input or "").strip()
    # NOTE(review): presumably these come from UI sliders, which can deliver
    # floats; range() below needs ints.
    max_iterations = int(max_iterations)
    chunk_lines = int(chunk_lines)

    # Parse Gemini keys
    gemini_keys = [k.strip() for k in re.split(r"[,\n]+", gemini_keys_raw or "") if k.strip()]
    if not gemini_keys:
        yield log("❌ No Gemini API keys provided.")
        return

    try:
        rotator = GeminiRotator(gemini_keys)
    except Exception as e:
        yield log(f"❌ Gemini init error: {e}")
        return

    # Validate HF token
    try:
        api = HfApi(token=hf_token)
        user = api.whoami()
        namespace = user["name"]
        yield log(f"✅ HuggingFace authenticated as: {namespace}")
    except Exception as e:
        yield log(f"❌ HF token error: {e}")
        return

    # ── Clone / prepare source repo ──
    # Everything below runs inside the temp dir's lifetime, because the RAG
    # index reads the cloned files lazily via get_file().
    with tempfile.TemporaryDirectory() as tmpdir:
        repo_dir = os.path.join(tmpdir, "source_repo")

        if source_input.startswith("https://github.com"):
            yield log(f"📦 Cloning GitHub repo: {source_input}")
            result = subprocess.run(
                ["git", "clone", "--depth=1", source_input, repo_dir],
                capture_output=True, text=True
            )
            if result.returncode != 0:
                yield log(f"❌ Git clone failed:\n{result.stderr}")
                return
            yield log("✅ Repo cloned.")

        elif "huggingface.co/spaces" in source_input:
            # Extract space id from URL
            m = re.search(r"huggingface\.co/spaces/([\w\-]+/[\w\-]+)", source_input)
            if not m:
                yield log("❌ Could not parse HF Space URL.")
                return
            space_id = m.group(1)
            yield log(f"📦 Cloning HF Space: {space_id}")
            result = subprocess.run(
                ["git", "clone", "--depth=1",
                 f"https://huggingface.co/spaces/{space_id}", repo_dir],
                capture_output=True, text=True
            )
            if result.returncode != 0:
                yield log(f"❌ HF Space clone failed:\n{result.stderr}")
                return
            yield log("✅ HF Space cloned.")

        elif os.path.isdir(source_input):
            repo_dir = source_input
            yield log(f"📁 Using local folder: {repo_dir}")

        else:
            yield log("❌ Source must be a GitHub URL, HF Space URL, or local folder path.")
            return

        # ── Build RAG index ──
        yield log("🔍 Building RAG index over source repo...")
        rag = RepoRAG(repo_dir, chunk_lines=chunk_lines)
        yield log(f"✅ Indexed {len(rag.chunks)} chunks from {len(set(c.path for c in rag.chunks))} files (chunk size: {rag.chunk_lines} lines).")

        file_tree = rag.file_tree()

        # ── Create HF Space ──
        space_mgr = SpaceManager(hf_token)
        safe_name = re.sub(r"[^a-zA-Z0-9\-]", "-", space_name.strip())[:50] or "repoforge-app"
        yield log(f"🚀 Creating HF Space: {namespace}/{safe_name}")
        try:
            space_mgr.create_space(namespace, safe_name)
            yield log(f"✅ Space created: {space_mgr.space_url()}")
        except Exception as e:
            yield log(f"❌ Failed to create space: {e}")
            return

        # ── Track files in space ──
        space_files: dict[str, str] = {}  # path -> content

        # ── Initial generation ──
        yield log("\n🤖 Asking Gemini to generate initial files...")

        initial_rag = rag.query(goal, top_k=8, budget_tokens=14000)
        initial_prompt = PROMPT_INITIAL.format(
            goal=goal,
            file_tree=file_tree[:3000],
            rag_chunks=initial_rag,
            agents_md=AGENTS_MD_FALLBACK,
        )

        try:
            response = rotator.generate(SYSTEM_ARCHITECT, initial_prompt, max_tokens=8192)
        except Exception as e:
            yield log(f"❌ Gemini error: {e}")
            return

        # Parse files from response
        new_files = extract_files_from_response(response)
        if not new_files:
            yield log("⚠️ Gemini didn't output any files. Showing raw response:")
            yield log(response[:2000])
            return

        # ── Handle incomplete files from initial generation ──
        pending_continuations = detect_incomplete(new_files)
        if pending_continuations:
            yield log(f"📝 {len(pending_continuations)} file(s) incomplete — requesting continuations...")
            yield from _finish_incomplete_files(rotator, goal, new_files, pending_continuations)

        # Ensure we have required files
        if "Dockerfile" not in new_files:
            yield log("⚠️ No Dockerfile generated — adding minimal one.")
            new_files["Dockerfile"] = textwrap.dedent("""
                FROM node:20-slim
                WORKDIR /app
                COPY package*.json ./
                RUN npm ci
                COPY . .
                RUN npm run build
                EXPOSE 7860
                CMD ["npm", "run", "preview", "--", "--port", "7860", "--host"]
            """).strip()

        if "README.md" not in new_files:
            new_files["README.md"] = textwrap.dedent(f"""
                ---
                title: {safe_name}
                emoji: 🔧
                colorFrom: blue
                colorTo: purple
                sdk: docker
                app_port: 7860
                pinned: false
                ---
                # {safe_name}
                Built with RepoForge.
            """).strip()

        # Upload all initial files
        yield log(f"\n📤 Uploading {len(new_files)} initial files to HF Space...")
        for fpath, content in new_files.items():
            try:
                space_mgr.upload_content(content, fpath)
                space_files[fpath] = content
                yield log(f"  ✅ {fpath} ({len(content):,} chars)")
            except Exception as e:
                yield log(f"  ❌ {fpath}: {e}")

        yield log(f"\n🔗 Space URL: {space_mgr.space_url()}")
        yield log("⏳ Waiting for initial build (60s)...")
        time.sleep(60)

        # ── Patch loop ──
        for iteration in range(1, max_iterations + 1):
            yield log(f"\n{'='*50}")
            yield log(f"🔁 Patch iteration {iteration}/{max_iterations}")

            # Fetch logs
            yield log("📋 Fetching build logs...")
            logs = space_mgr.get_logs(lines=80)
            if not logs:
                logs = "(no logs available yet — space may still be building)"
            yield log(f"Logs preview:\n{logs[:500]}...")

            # Honour a NEED_CONTEXT request made in the previous response;
            # otherwise retrieve context from the goal plus the current logs.
            need_ctx_matches = re.findall(r"NEED_CONTEXT:\s*(.+)", response)
            if need_ctx_matches:
                extra_query = " ".join(need_ctx_matches)
                yield log(f"🔍 Agent needs more context: '{extra_query[:100]}'")
                patch_rag = rag.query(extra_query, top_k=6, budget_tokens=10000)
            else:
                patch_rag = rag.query(goal + " " + logs, top_k=6, budget_tokens=10000)

            patch_prompt = PROMPT_PATCH.format(
                goal=goal,
                logs=logs[:3000],
                file_list="\n".join(space_files.keys()),
                rag_chunks=patch_rag,
            )

            try:
                response = rotator.generate(SYSTEM_ARCHITECT, patch_prompt, max_tokens=8192)
            except Exception as e:
                yield log(f"❌ Gemini error: {e}")
                break

            if "GOAL_COMPLETE" in response:
                yield log("\n🎉 GOAL COMPLETE! Agent confirmed success.")
                break

            # Apply patches
            patches = extract_patches(response)
            new_in_patch = extract_files_from_response(response)

            if not patches and not new_in_patch:
                yield log("⚠️ No patches or new files found in response.")
                yield log(response[:1000])

            for fpath, patch_str in patches.items():
                original = space_files.get(fpath, "")
                patched = apply_unified_diff(original, patch_str)
                # apply_unified_diff signals failure by returning its input.
                if patched != original:
                    try:
                        space_mgr.upload_content(patched, fpath)
                        space_files[fpath] = patched
                        yield log(f"  🩹 Patched: {fpath}")
                    except Exception as e:
                        yield log(f"  ❌ Upload failed for {fpath}: {e}")
                else:
                    yield log(f"  ⚠️ Patch didn't apply cleanly for {fpath}")

            # New files from patch response (shouldn't be many after init)
            patch_new_incomplete = detect_incomplete(new_in_patch)
            if patch_new_incomplete:
                yield log(f"  📝 {len(patch_new_incomplete)} new file(s) incomplete in patch round — continuing...")
                yield from _finish_incomplete_files(
                    rotator, goal, new_in_patch, patch_new_incomplete, indent="    "
                )

            # Only genuinely new paths are uploaded; existing files are
            # changed through patches, never through full rewrites.
            for fpath, content in new_in_patch.items():
                if fpath not in space_files:
                    try:
                        space_mgr.upload_content(content, fpath)
                        space_files[fpath] = content
                        yield log(f"  ➕ New file: {fpath}")
                    except Exception as e:
                        yield log(f"  ❌ {fpath}: {e}")

            if patches or new_in_patch:
                yield log("⏳ Waiting for rebuild (45s)...")
                time.sleep(45)

        yield log(f"\n✨ Done! Space: {space_mgr.space_url()}")
        yield log(f"📊 Files in space: {', '.join(space_files.keys())}")
809
+
810
+
811
+ # ─── Gradio UI ────────────────────────────────────────────────────────────────
812
+
813
# Custom stylesheet for the Gradio app; passed to `gr.Blocks(css=CSS)` in
# `build_ui`. It imports the Syne and JetBrains Mono web fonts, declares the
# dark-theme colour palette as CSS variables on `:root`, and styles the
# selectors used by the UI: `h1.title`, `.subtitle`, `.panel`, `.log-box`,
# `.tip`, `button.primary`, plus generic `label` / `input` / `textarea` rules.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;700&family=Syne:wght@400;700;800&display=swap');

:root {
--bg: #0a0a0f;
--surface: #111118;
--border: #1e1e2e;
--accent: #7c3aed;
--accent2: #06b6d4;
--text: #e2e8f0;
--muted: #64748b;
--success: #10b981;
--danger: #ef4444;
}

body, .gradio-container {
background: var(--bg) !important;
font-family: 'Syne', sans-serif !important;
color: var(--text) !important;
}

.gradio-container { max-width: 1100px !important; margin: 0 auto !important; }

h1.title {
font-family: 'Syne', sans-serif;
font-size: 2.8rem;
font-weight: 800;
background: linear-gradient(135deg, #7c3aed, #06b6d4);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin: 0;
letter-spacing: -0.04em;
}

.subtitle { color: var(--muted); font-size: 0.95rem; margin-top: 4px; font-family: 'JetBrains Mono', monospace; }

.panel {
background: var(--surface);
border: 1px solid var(--border);
border-radius: 12px;
padding: 20px;
}

label { color: var(--muted) !important; font-size: 0.82rem !important; font-family: 'JetBrains Mono', monospace !important; }

input, textarea {
background: var(--bg) !important;
border: 1px solid var(--border) !important;
color: var(--text) !important;
font-family: 'JetBrains Mono', monospace !important;
border-radius: 8px !important;
}

input:focus, textarea:focus {
border-color: var(--accent) !important;
box-shadow: 0 0 0 3px rgba(124,58,237,0.15) !important;
}

button.primary {
background: linear-gradient(135deg, var(--accent), var(--accent2)) !important;
border: none !important;
border-radius: 10px !important;
font-family: 'Syne', sans-serif !important;
font-weight: 700 !important;
font-size: 1rem !important;
letter-spacing: 0.02em !important;
padding: 14px 28px !important;
color: white !important;
cursor: pointer !important;
transition: opacity 0.2s !important;
}
button.primary:hover { opacity: 0.88 !important; }

.log-box textarea {
font-family: 'JetBrains Mono', monospace !important;
font-size: 0.78rem !important;
background: #050508 !important;
color: #a0f0b0 !important;
border: 1px solid #1a2a1a !important;
}

.tip {
font-family: 'JetBrains Mono', monospace;
font-size: 0.75rem;
color: var(--muted);
border-left: 2px solid var(--accent);
padding-left: 10px;
margin-top: 8px;
}
"""
903
+
904
def build_ui():
    """Build the RepoForge Gradio interface.

    The layout is a two-column row: the left column collects credentials,
    the source location, the conversion goal and tuning sliders; the right
    column shows the streaming agent log. A collapsed accordion underneath
    explains the workflow. The launch button is wired to a generator that
    streams ``run_agent`` output into the log box.

    Returns:
        The constructed ``gr.Blocks`` app. It is not launched here; the
        caller is expected to call ``.launch()`` on it.
    """
    with gr.Blocks(css=CSS, title="RepoForge") as demo:

        gr.HTML("""
        <div style="padding: 32px 0 8px 0;">
        <h1 class="title">RepoForge</h1>
        <p class="subtitle">// agentic repo conversion · surgical patches · rag-powered context</p>
        </div>
        """)

        with gr.Row():
            # ── Left column: config ──
            # The "panel" class is attached to the column itself. Emitting an
            # opening <div> in one gr.HTML and the closing </div> in another
            # does not wrap the components in between, because each gr.HTML
            # is rendered inside its own container.
            with gr.Column(scale=1, elem_classes=["panel"]):
                gr.HTML('<p style="font-size:0.85rem;color:#7c3aed;font-weight:700;margin:0 0 12px;">🔑 CREDENTIALS</p>')

                hf_token = gr.Textbox(
                    label="HuggingFace Access Token",
                    placeholder="hf_...",
                    type="password",
                    lines=1,
                )

                # NOTE(review): password-type textboxes may render as a
                # single-line input, in which case `lines=4` and the
                # one-per-line hint would not apply — confirm against the
                # Gradio version in use.
                gemini_keys = gr.Textbox(
                    label="Gemini API Keys (one per line or comma-separated)",
                    placeholder="AIzaSy...\nAIzaSy...",
                    lines=4,
                    type="password",
                )
                gr.HTML('<p class="tip">Multiple keys → auto-rotates on quota exhaustion</p>')

                gr.HTML('<p style="font-size:0.85rem;color:#06b6d4;font-weight:700;margin:16px 0 12px;">📦 SOURCE</p>')

                source_input = gr.Textbox(
                    label="Source (GitHub URL / HF Space URL / Local Folder Path)",
                    placeholder="https://github.com/owner/repo",
                    lines=1,
                )

                space_name = gr.Textbox(
                    label="New HF Space Name",
                    placeholder="my-converted-app",
                    lines=1,
                )

                gr.HTML('<p style="font-size:0.85rem;color:#10b981;font-weight:700;margin:16px 0 12px;">🎯 GOAL</p>')

                goal = gr.Textbox(
                    label="Conversion Goal",
                    placeholder="Convert this Python Flask app to a Rust Axum web server with identical API endpoints",
                    lines=4,
                )

                max_iters = gr.Slider(
                    label="Max patch iterations",
                    minimum=2, maximum=20, step=1, value=8,
                )

                chunk_lines_slider = gr.Slider(
                    label="RAG chunk size (lines per chunk, 300–1200)",
                    minimum=300, maximum=1200, step=100, value=600,
                    info="Larger = more context per RAG hit, more tokens used",
                )

                run_btn = gr.Button("⚡ Launch RepoForge", variant="primary", elem_classes=["primary"])

            # ── Right column: logs ──
            with gr.Column(scale=1, elem_classes=["panel"]):
                gr.HTML('<p style="font-size:0.85rem;color:#7c3aed;font-weight:700;margin:0 0 12px;">📟 AGENT LOG</p>')

                log_output = gr.Textbox(
                    label="",
                    lines=30,
                    max_lines=60,
                    interactive=False,
                    elem_classes=["log-box"],
                    show_copy_button=True,
                )

        # ── How it works ──
        with gr.Accordion("ℹ️ How RepoForge works", open=False):
            gr.Markdown("""
            **RepoForge** is a fully agentic repo-conversion loop:

            1. **Clones** your source repo (GitHub, HF Space, or local folder)
            2. **Indexes** it with a lightweight RAG system — only relevant chunks are sent to the LLM, never the whole repo
            3. **Creates** a HuggingFace Space with the Docker preset
            4. **Generates** `Dockerfile`, `vite.config.js`, `README.md`, and initial source files
            5. **Iterates**: fetches build logs → asks Gemini to output surgical `unified diff` patches → applies them → waits for rebuild
            6. **Never** fully regenerates a file after initial creation — only patches
            7. **Rotates** Gemini API keys automatically when quota is hit

            **Source input formats:**
            - `https://github.com/owner/repo` — cloned via git
            - `https://huggingface.co/spaces/owner/name` — cloned from HF
            - `/home/user/myproject` — local folder (must be accessible)
            """)

        # ── Wire up ──
        # Per-session store for the log text produced so far; it is both an
        # input and an output of the click handler, so a new run appends to
        # the text left by the previous one.
        accumulated_logs = gr.State("")

        def stream_wrapper(hf_tok, gem_keys, goal_txt, src, sname, iters, chunk_sz, prev_logs):
            """Stream ``run_agent`` output into the log box.

            Forwards the form values to ``run_agent`` (slider values are cast
            to ``int``), appends every yielded chunk to the running log text,
            and yields that text twice per step: once for the visible log
            textbox and once for the ``accumulated_logs`` state.
            """
            all_logs = prev_logs or ""
            for chunk in run_agent(
                hf_token=hf_tok,
                gemini_keys_raw=gem_keys,
                goal=goal_txt,
                source_input=src,
                space_name=sname,
                max_iterations=int(iters),
                chunk_lines=int(chunk_sz),
            ):
                all_logs += chunk
                yield all_logs, all_logs

        run_btn.click(
            fn=stream_wrapper,
            inputs=[hf_token, gemini_keys, goal, source_input, space_name, max_iters, chunk_lines_slider, accumulated_logs],
            outputs=[log_output, accumulated_logs],
        )

    return demo
1032
+
1033
+
1034
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it on port 7860
    # without requesting a public share link.
    launch_options = {"server_port": 7860, "share": False}
    app = build_ui()
    app.launch(**launch_options)