# Multi_Agent_Model/src/tools/refine_knowledge.py
# Author: sonthaiha — "Fresh Deployment with LFS" (commit 1804a7a)
# Distills synthetic Vietnamese retail Q&A training data from source documents
# using the DeepSeek reasoner model as a teacher.
import os
import json
import time
import glob
from pypdf import PdfReader
from openai import OpenAI
# --- CONFIGURATION ---
# SECURITY: never commit a live API key to source control. The key is read
# from the environment; set DEEPSEEK_API_KEY before running. The previously
# hard-coded key should be considered leaked and rotated.
API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
BASE_URL = "https://api.deepseek.com"
MODEL_NAME = "deepseek-reasoner"  # The R1 reasoning model
# NOTE(review): Google-Drive/Colab paths — adjust when running outside Colab.
DOCS_DIR = "/content/drive/MyDrive/ProjectA_Backup/src/data/docs"
OUTPUT_FILE = "/content/drive/MyDrive/ProjectA_Backup/src/data/distilled_reasoning_deepseek.jsonl"
def extract_content_from_files():
    """Load every PDF and TXT file found in DOCS_DIR.

    Returns:
        list[dict]: one entry per readable file, with keys
        "source" (base filename) and "text" (full extracted text).
        Unreadable files are reported and skipped rather than aborting.
    """
    print(f"πŸ“‚ Scanning {DOCS_DIR}...")
    docs = []

    # 1. Read PDFs
    for fpath in glob.glob(os.path.join(DOCS_DIR, "*.pdf")):
        try:
            reader = PdfReader(fpath)
            # extract_text() can return None for image-only pages; the
            # original `text += page.extract_text() + "\n"` raised TypeError
            # in that case. Guard with `or ""` and join once (avoids
            # quadratic string concatenation on large PDFs).
            text = "\n".join(page.extract_text() or "" for page in reader.pages)
            docs.append({"source": os.path.basename(fpath), "text": text})
        except Exception as e:
            print(f"❌ Error reading PDF {fpath}: {e}")

    # 2. Read TXT/MD (Your Policy Files)
    for fpath in glob.glob(os.path.join(DOCS_DIR, "*.txt")):
        try:
            with open(fpath, "r", encoding="utf-8") as f:
                docs.append({"source": os.path.basename(fpath), "text": f.read()})
        except Exception as e:
            print(f"❌ Error reading TXT {fpath}: {e}")

    print(f"βœ… Loaded {len(docs)} documents.")
    return docs
def chunk_text(text, chunk_size=2000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters."""
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces
def generate_synthetic_data(client, chunks):
    """Distill each text chunk into one chat-format training sample.

    For every chunk, asks the teacher model (MODEL_NAME) to produce a JSON
    object with "user_query", "thought_process" and "response", then wraps
    the reasoning in <think> tags so the student model learns to emit it.

    Args:
        client: an OpenAI-compatible client pointed at the DeepSeek API.
        chunks: list of document text chunks to distill.

    Returns:
        list[dict]: entries shaped like {"messages": [system, user, assistant]}.
        Chunks that fail (API error, bad JSON, missing keys) are skipped
        with a warning so one bad chunk doesn't abort the whole run.
    """
    dataset = []
    print(f"πŸ§ͺ Starting Distillation using {MODEL_NAME}...")
    print(" (This takes time because DeepSeek 'Thinks' before answering)")
    for i, chunk in enumerate(chunks):
        print(f" πŸ‘‰ Processing Chunk {i+1}/{len(chunks)}...")
        # The Teacher Prompt
        prompt = f'''
SOURCE DOCUMENT:
"""{chunk}"""
TASK:
You are an Expert Retail Data Generator.
Based on the text above, create a Realistic Scenario for a Vietnamese Store Owner.
1. **User Query:** A specific, natural question a user would ask about this topic (in Vietnamese).
2. **Reasoning:** Explain HOW to solve it based on the text (Chain of Thought).
3. **Response:** The final answer to the user (in Vietnamese).
OUTPUT JSON FORMAT:
{{
"user_query": "...",
"thought_process": "...",
"response": "..."
}}
'''
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant. Output JSON only."},
                    {"role": "user", "content": prompt}
                ],
                response_format={ 'type': 'json_object' }
            )
            # deepseek-reasoner returns its final answer in 'content'
            # (the internal trace lives in 'reasoning_content'); we rely on
            # the prompt's JSON schema for the fields we need.
            content = response.choices[0].message.content
            data = json.loads(content)

            # Validate the schema explicitly so a malformed reply produces a
            # clear diagnostic instead of an opaque KeyError further down.
            missing = [k for k in ("user_query", "thought_process", "response")
                       if k not in data]
            if missing:
                raise ValueError(f"model output missing keys: {missing}")

            # Construct Training Entry: wrap the thought in <think> tags so
            # the student model (Qwen) learns to mimic the reasoning trace.
            full_response = f"<think>{data['thought_process']}</think>\n{data['response']}"
            entry = {
                "messages": [
                    {"role": "system", "content": "You are Project A, an expert Retail Consultant."},
                    {"role": "user", "content": data['user_query']},
                    {"role": "assistant", "content": full_response}
                ]
            }
            dataset.append(entry)
            print(f" βœ… Generated: {data['user_query'][:50]}...")
        except Exception as e:
            # Broad on purpose: network errors, JSON decode errors and schema
            # violations should all skip this chunk, not kill the batch.
            print(f" ⚠️ API Error: {e}")
    return dataset
def main():
    """End-to-end driver: load documents, chunk, distill, save as JSONL."""
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

    # 1. Load Docs
    docs = extract_content_from_files()
    if not docs:
        print("❌ No documents found in src/data/docs")
        return

    # 2. Chunking
    all_chunks = []
    for doc in docs:
        # Limit to first 3 chunks per doc to save API credits for this test.
        # Remove [:3] to process the whole file.
        all_chunks.extend(chunk_text(doc['text'])[:3])
    print(f"πŸ“Š Prepared {len(all_chunks)} chunks for distillation.")

    # 3. Distill
    new_data = generate_synthetic_data(client, all_chunks)

    # 4. Save — create the target directory first so a fresh checkout/Drive
    #    mount doesn't crash with FileNotFoundError on open().
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for entry in new_data:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    print(f"πŸŽ‰ Distillation Complete! Saved {len(new_data)} samples.")
    print(f"πŸ“ File: {OUTPUT_FILE}")
    print("πŸ‘‰ Now combine this with your 'training_dataset.jsonl' and Fine-Tune!")


if __name__ == "__main__":
    main()