Spaces:

catninja123
/

mash-stylebart-trainer

Paused

App Files Community

catninja123 committed on Mar 8

Commit

79ad8bc

verified ·

1 Parent(s): c7813a5

Upload src/generate_grok_pairs.py with huggingface_hub

Browse files

Files changed (1) hide show

src/generate_grok_pairs.py +156 -0

src/generate_grok_pairs.py ADDED Viewed

	@@ -0,0 +1,156 @@

+"""
+Generate AI paraphrase pairs using Grok API (xAI).
+Uses the cleaned human texts and generates Grok-style AI versions.
+This complements the existing Gemini-generated pairs for multi-LLM training.
+"""
+import json
+import os
+import time
+import asyncio
+import aiohttp
+# xAI API key, read from the environment; empty when unset (main() refuses to run then).
+XAI_API_KEY = os.environ.get('XAI_API_KEY', '')
+# Chat-completions endpoint that every paraphrase request is POSTed to.
+XAI_API_URL = 'https://api.x.ai/v1/chat/completions'
+# JSONL file of human texts to paraphrase, and the JSONL file generated pairs are appended to.
+INPUT_FILE = '/home/ubuntu/mash_training/data/human_texts_clean.jsonl'
+OUTPUT_FILE = '/home/ubuntu/mash_training/data/grok_pairs.jsonl'
+# System message sent with every request.
+SYSTEM_PROMPT = """You are a writing assistant. Your task is to paraphrase the given text while:
+1. Keeping ALL the same information and meaning
+2. Using a polished, professional AI writing style
+3. Making it sound like it was written by an AI language model
+4. Using smooth transitions, parallel structures, and sophisticated vocabulary
+5. Maintaining the same approximate length
+Do NOT add new information. Do NOT remove any information. Just rephrase it in a polished AI style.
+Output ONLY the paraphrased text, nothing else."""
+CONCURRENCY = 15  # Increased for faster generation
+# Shared by all paraphrase_one() calls; each call holds one slot for its whole retry loop,
+# so at most CONCURRENCY texts are being processed at any time.
+# NOTE(review): this is created at import time, before asyncio.run() starts its loop.
+# On Python < 3.10 asyncio primitives bind to the loop current at construction, which can
+# raise "attached to a different loop" under contention - confirm the target Python version.
+semaphore = asyncio.Semaphore(CONCURRENCY)
+async def paraphrase_one(session, essay_id, text, essay_type, retries=3):
+    """Paraphrase one text using Grok API."""
+    type_hint = "personal statement" if essay_type == "ps" else "college supplement essay"
+    headers = {
+        'Authorization': f'Bearer {XAI_API_KEY}',
+        'Content-Type': 'application/json',
+    }
+    payload = {
+        'model': 'grok-3-mini-fast',
+        'messages': [
+            {'role': 'system', 'content': SYSTEM_PROMPT},
+            {'role': 'user', 'content': f'Paraphrase this {type_hint} excerpt in AI style:\n\n{text}'},
+        ],
+        'temperature': 0.7,
+        'max_tokens': max(len(text.split()) * 3, 512),
+    }
+    async with semaphore:
+        for attempt in range(retries):
+            try:
+                async with session.post(XAI_API_URL, json=payload, headers=headers, timeout=aiohttp.ClientTimeout(total=60)) as resp:
+                    if resp.status == 429:
+                        # Rate limited - wait and retry
+                        await asyncio.sleep(5 * (attempt + 1))
+                        continue
+                    resp.raise_for_status()
+                    result = await resp.json()
+                    ai_text = result['choices'][0]['message']['content'].strip()
+                    # Basic validation
+                    if len(ai_text.split()) >= len(text.split()) * 0.4:
+                        return essay_id, ai_text
+                    else:
+                        continue
+            except Exception as e:
+                if attempt < retries - 1:
+                    await asyncio.sleep(2 ** attempt)
+                else:
+                    print(f"  ERROR {essay_id}: {e}", flush=True)
+                    return essay_id, None
+        return essay_id, None
+async def main():
+    if not XAI_API_KEY:
+        print("ERROR: XAI_API_KEY not set")
+        return
+    # Load clean human texts
+    data = []
+    with open(INPUT_FILE) as f:
+        for line in f:
+            data.append(json.loads(line))
+    print(f"Loaded {len(data)} clean samples", flush=True)
+    # Load existing progress
+    done_ids = set()
+    if os.path.exists(OUTPUT_FILE):
+        with open(OUTPUT_FILE) as f:
+            for line in f:
+                d = json.loads(line)
+                done_ids.add(d['essay_id'])
+        print(f"Already done: {len(done_ids)}", flush=True)
+    # Filter remaining
+    remaining = [d for d in data if d['essay_id'] not in done_ids]
+    print(f"Remaining: {len(remaining)}", flush=True)
+    if not remaining:
+        print("All done!")
+        return
+    # Process in batches
+    batch_size = 30
+    total_done = 0
+    total_errors = 0
+    start_time = time.time()
+    async with aiohttp.ClientSession() as session:
+        with open(OUTPUT_FILE, 'a', encoding='utf-8') as out:
+            for batch_start in range(0, len(remaining), batch_size):
+                batch = remaining[batch_start:batch_start + batch_size]
+                coros = [
+                    paraphrase_one(session, d['essay_id'], d['human_text'], d['type'])
+                    for d in batch
+                ]
+                results = await asyncio.gather(*coros)
+                for (essay_id, ai_text), orig in zip(results, batch):
+                    if ai_text:
+                        pair = {
+                            'essay_id': orig['essay_id'],
+                            'type': orig['type'],
+                            'tier': orig.get('tier', 'unknown'),
+                            'year': orig.get('year', 'unknown'),
+                            'human_text': orig['human_text'],
+                            'ai_text': ai_text,
+                            'human_words': len(orig['human_text'].split()),
+                            'ai_words': len(ai_text.split()),
+                            'ai_model': 'grok-3-mini-fast',
+                        }
+                        out.write(json.dumps(pair, ensure_ascii=False) + '\n')
+                        total_done += 1
+                    else:
+                        total_errors += 1
+                out.flush()
+                elapsed = time.time() - start_time
+                rate = (total_done + total_errors) / elapsed if elapsed > 0 else 0
+                remaining_count = len(remaining) - batch_start - len(batch)
+                eta = remaining_count / rate / 60 if rate > 0 else 0
+                print(f"  Batch {batch_start//batch_size + 1}: "
+                      f"{total_done} done, {total_errors} errors, "
+                      f"{rate:.1f}/s, ETA {eta:.0f}min", flush=True)
+    elapsed = time.time() - start_time
+    print(f"\nDONE: {total_done} pairs, {total_errors} errors in {elapsed/60:.1f} min", flush=True)
+# Run the async pipeline only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    asyncio.run(main())