# --- Export artifacts from the hosting UI (not part of the program) ---
# Spaces:
# Paused / Paused
# File size: 5,428 Bytes
# 1804a7a
# (line-number gutter "1 2 3 ... 147" removed — it was UI chrome, not code)
import os
import json
import time
import glob
from pypdf import PdfReader
from openai import OpenAI
# --- CONFIGURATION ---
# SECURITY: the API key was previously hardcoded here and committed to the
# repo — that key should be considered leaked and rotated. Prefer the
# DEEPSEEK_API_KEY environment variable; the empty fallback makes a missing
# key fail loudly at the first API call instead of silently using a leaked one.
API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
BASE_URL = "https://api.deepseek.com"
MODEL_NAME = "deepseek-reasoner"  # DeepSeek R1 (reasoning) model
# Input documents (PDF / TXT / MD) and output JSONL for distilled samples.
DOCS_DIR = "/content/drive/MyDrive/ProjectA_Backup/src/data/docs"
OUTPUT_FILE = "/content/drive/MyDrive/ProjectA_Backup/src/data/distilled_reasoning_deepseek.jsonl"
def extract_content_from_files():
    """Load every PDF and text/markdown document found in DOCS_DIR.

    Returns:
        list[dict]: one ``{"source": <basename>, "text": <content>}`` entry
        per readable file; unreadable files are skipped with a warning so a
        single bad document cannot abort the whole scan.
    """
    print(f"[scan] Scanning {DOCS_DIR}...")
    docs = []

    # 1. PDFs — concatenate the text of every page.
    for fpath in glob.glob(os.path.join(DOCS_DIR, "*.pdf")):
        try:
            reader = PdfReader(fpath)
            # extract_text() may return None for image-only pages; the
            # original `text += page.extract_text() + "\n"` raised TypeError
            # on such pages, so coerce None to "".
            text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)
            docs.append({"source": os.path.basename(fpath), "text": text})
        except Exception as e:
            print(f"[error] Error reading PDF {fpath}: {e}")

    # 2. Plain-text policy files. The original comment promised TXT/MD but
    # only globbed *.txt — include *.md as well.
    for pattern in ("*.txt", "*.md"):
        for fpath in glob.glob(os.path.join(DOCS_DIR, pattern)):
            try:
                with open(fpath, "r", encoding="utf-8") as f:
                    docs.append({"source": os.path.basename(fpath), "text": f.read()})
            except Exception as e:
                print(f"[error] Error reading TXT {fpath}: {e}")

    print(f"[ok] Loaded {len(docs)} documents.")
    return docs
def chunk_text(text, chunk_size=2000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters."""
    pieces = []
    start = 0
    total = len(text)
    while start < total:
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces
def generate_synthetic_data(client, chunks):
    """Distill chat-format training samples from document chunks.

    For each chunk, asks the teacher model (MODEL_NAME) to produce a JSON
    object with ``user_query`` / ``thought_process`` / ``response``, then wraps
    the reasoning in <think> tags so the student model learns to emit it.

    Args:
        client: an OpenAI-compatible client pointed at the DeepSeek API.
        chunks: list of text chunks to distill from.

    Returns:
        list[dict]: entries shaped as ``{"messages": [...]}`` ready for
        chat fine-tuning. Failed chunks are logged and skipped.
    """
    dataset = []
    print(f"[distill] Starting Distillation using {MODEL_NAME}...")
    print("          (This takes time because DeepSeek 'Thinks' before answering)")
    for i, chunk in enumerate(chunks):
        print(f"  [chunk] Processing Chunk {i+1}/{len(chunks)}...")
        # The Teacher Prompt (kept verbatim — it is part of runtime behavior).
        prompt = f'''
SOURCE DOCUMENT:
"""{chunk}"""
TASK:
You are an Expert Retail Data Generator.
Based on the text above, create a Realistic Scenario for a Vietnamese Store Owner.
1. **User Query:** A specific, natural question a user would ask about this topic (in Vietnamese).
2. **Reasoning:** Explain HOW to solve it based on the text (Chain of Thought).
3. **Response:** The final answer to the user (in Vietnamese).
OUTPUT JSON FORMAT:
{{
"user_query": "...",
"thought_process": "...",
"response": "..."
}}
'''
        try:
            # NOTE(review): DeepSeek's docs suggest `deepseek-reasoner` may not
            # honor/accept response_format json_object — confirm against the
            # current API; the except below catches a rejection either way.
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant. Output JSON only."},
                    {"role": "user", "content": prompt}
                ],
                response_format={ 'type': 'json_object' }
            )
            # DeepSeek Reasoner separates 'reasoning_content' (internal) and
            # 'content' (final). We trust the model to follow the JSON schema
            # given in the prompt and parse 'content' directly.
            content = (response.choices[0].message.content or "").strip()
            # Models sometimes wrap JSON in markdown fences despite the
            # "JSON only" instruction; strip them before parsing so those
            # replies are salvaged instead of discarded.
            if content.startswith("```"):
                content = content.strip("`").strip()
                if content.lower().startswith("json"):
                    content = content[4:]
            data = json.loads(content)
            # Validate the schema explicitly: a clear error message beats a
            # bare KeyError from the f-string below.
            missing = {"user_query", "thought_process", "response"} - set(data)
            if missing:
                raise ValueError(f"teacher reply missing keys: {sorted(missing)}")
            # Wrap the thought in <think> tags so the student (Qwen) learns
            # to emit its reasoning before the answer.
            full_response = f"<think>{data['thought_process']}</think>\n{data['response']}"
            entry = {
                "messages": [
                    {"role": "system", "content": "You are Project A, an expert Retail Consultant."},
                    {"role": "user", "content": data['user_query']},
                    {"role": "assistant", "content": full_response}
                ]
            }
            dataset.append(entry)
            print(f"  [ok] Generated: {data['user_query'][:50]}...")
        except Exception as e:
            # Best-effort by design: log and move on to the next chunk.
            print(f"  [warn] API Error: {e}")
    return dataset
def main():
    """Entry point: load docs, chunk them, distill via the teacher, save JSONL."""
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

    # 1. Load documents.
    docs = extract_content_from_files()
    if not docs:
        print("[error] No documents found in src/data/docs")
        return

    # 2. Chunking — cap at 3 chunks per doc to conserve API credits during
    #    testing; remove the [:3] slice to process whole files.
    all_chunks = []
    for doc in docs:
        all_chunks.extend(chunk_text(doc['text'])[:3])
    print(f"[info] Prepared {len(all_chunks)} chunks for distillation.")

    # 3. Distill.
    new_data = generate_synthetic_data(client, all_chunks)

    # 4. Save as JSONL. Create the parent directory first so a fresh Drive
    #    mount (or changed OUTPUT_FILE) doesn't crash the run after paying
    #    for all the API calls above.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for entry in new_data:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"[done] Distillation Complete! Saved {len(new_data)} samples.")
    print(f"[done] File: {OUTPUT_FILE}")
    print("[next] Now combine this with your 'training_dataset.jsonl' and Fine-Tune!")


if __name__ == "__main__":
    main()