# rahul7star's picture
# new fix: no story duplicate test
# 33c755d verified
import os
import time
import json
import re
import threading
import random
import string
import hashlib
from datetime import datetime
from io import BytesIO
import requests
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import HfApi, list_repo_files
# =====================================================
# CONFIG
# =====================================================
# OpenAI-compatible chat-completions endpoint served by the companion HF Space.
HF_SPACE_URL = (
    "https://rahul7star-qwen3-4b-thinking-2509-ai-storey.hf.space/v1/chat/completions"
)
# Optional HF access token; requests go out unauthenticated when unset.
HF_TOKEN = os.getenv("HF_TOKEN")
# Dataset repo that receives generated stories and job manifests.
DATASET_REPO = "rahul7star/zimg-story-factory"
api = HfApi(token=HF_TOKEN)
app = FastAPI(title="auto-storey")
# Wide-open CORS so the endpoints are callable from any browser origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# =====================================================
# LOGGING
# =====================================================
def log(msg):
    """Print *msg* prefixed with a local timestamp and return the formatted line."""
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    formatted = "[{}] {}".format(stamp, msg)
    print(formatted)
    return formatted
# =====================================================
# UNIQUE JOB ID
# =====================================================
def generate_job_id(title: str = ""):
    """Build a unique job id: job_<UTC stamp>_<5-char suffix>[_<4-char title hash>]."""
    stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    alphabet = string.ascii_lowercase + string.digits
    suffix = "".join(random.choice(alphabet) for _ in range(5))
    pieces = ["job", stamp, suffix]
    if title:
        # Short digest ties the id to the title without leaking it verbatim.
        pieces.append(hashlib.md5(title.encode()).hexdigest()[:4])
    return "_".join(pieces)
# =====================================================
# HF RESPONSE EXTRACTOR
# =====================================================
def extract_llm_content(resp_json):
    """Extract the assistant message text from an OpenAI-style chat response.

    Returns "" for any missing/empty piece of the structure. The original
    crashed with IndexError when "choices" was present but an empty list
    (the .get default only applies to a *missing* key), and with
    AttributeError when "content" was an explicit None.
    """
    choices = resp_json.get("choices") or [{}]
    message = choices[0].get("message") or {}
    return (message.get("content") or "").strip()
# =====================================================
# CALL HF SPACE
# =====================================================
def llama_chat(system, user):
    """Send a system/user message pair to the HF Space and return the reply text.

    Raises on HTTP errors (via raise_for_status) and on an empty completion.
    """
    headers = {"Content-Type": "application/json"}
    if HF_TOKEN:
        headers["Authorization"] = "Bearer " + HF_TOKEN
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    payload = {
        "messages": messages,
        "temperature": 1.05,
        "top_p": 0.92,
        "presence_penalty": 0.6,
        "frequency_penalty": 0.4,
        "stream": False,
        "max_tokens": 1024,
    }
    log("Sending request to HF Space")
    response = requests.post(HF_SPACE_URL, json=payload, headers=headers, timeout=300)
    response.raise_for_status()
    body = response.json()
    log("Response received")
    content = extract_llm_content(body)
    if not content:
        raise ValueError("Empty LLM response")
    return content
# =====================================================
# DATASET TITLE SCAN
# =====================================================
def fetch_existing_titles(dataset_repo):
    """Collect lowercased title slugs already present in the dataset repo.

    Story files are markdown; the top-level folder name is split on "_" and
    the third piece is treated as the title slug. Returns an empty set when
    the repo scan fails for any reason (best-effort dedup, never fatal).
    """
    try:
        repo_files = list_repo_files(dataset_repo, repo_type="dataset")
        found = set()
        for path in repo_files:
            if not path.endswith(".md"):
                continue
            pieces = path.split("/")[0].split("_", 2)
            if len(pieces) == 3:
                found.add(pieces[2].lower())
        return found
    except Exception as e:
        log(f"Dataset scan failed: {e}")
        return set()
def title_similarity(a, b):
    """Jaccard similarity of the underscore-separated word sets of *a* and *b*."""
    words_a, words_b = set(a.split("_")), set(b.split("_"))
    combined = words_a | words_b
    if not combined:
        return 0
    return len(words_a & words_b) / len(combined)
# =====================================================
# CLEAN STORY
# =====================================================
def clean_generated_story(story_text, prompt):
    """Strip an echoed prompt prefix, drop blank/duplicate paragraphs, keep at most 3."""
    text = story_text.strip()
    # The model sometimes echoes the prompt verbatim at the start; cut it off.
    if text.lower().startswith(prompt.lower()):
        text = text[len(prompt):].strip()
    kept = []
    for raw_line in text.split("\n"):
        paragraph = raw_line.strip()
        if paragraph and paragraph not in kept:
            kept.append(paragraph)
    return "\n\n".join(kept[:3])
# =====================================================
# UNIQUE IDEA GENERATOR
# =====================================================
def generate_idea(max_retries=6):
    """Ask the LLM for a story idea whose title is new to the dataset.

    Retries up to *max_retries* times when the response is unparseable, the
    slugified title already exists, or it overlaps an existing title with
    Jaccard similarity > 0.6.

    Returns:
        The idea dict parsed from the LLM's JSON.

    Raises:
        RuntimeError: if no acceptable idea is produced within *max_retries*.
    """
    existing_titles = fetch_existing_titles(DATASET_REPO)
    system = """
You are a highly creative children's author.
Rules:
- Title MUST be UNIQUE.
- Title should usually be 3 words.
- Avoid patterns like "The Day", "Moon", "Magic", "Adventure".
- Be imaginative and unexpected.
- Return STRICT JSON only.
"""
    user = """
Invent a UNIQUE children's story idea.
Return JSON:
{
"title":"",
"character":"",
"setting":"",
"twist":"",
"style":"cinematic storybook, kid-friendly",
"author":"OhamLab"
}
"""
    for attempt in range(max_retries):
        log(f"Idea attempt {attempt+1}")
        raw = llama_chat(system, user)
        raw = raw.replace("```json", "").replace("```", "")
        match = re.search(r"\{.*\}", raw, re.S)
        if not match:
            continue
        # Fix: a malformed JSON payload (or one missing "title") used to raise
        # out of the loop and kill every remaining retry; now it costs one attempt.
        try:
            idea = json.loads(match.group())
            title_clean = re.sub(r"\W+", "_", idea["title"]).lower()
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            log(f"Unparseable idea payload → retry ({e})")
            continue
        if title_clean in existing_titles:
            log("Duplicate title → retry")
            continue
        similar = any(
            title_similarity(title_clean, t) > 0.6
            for t in existing_titles
        )
        if similar:
            log("Similar title → retry")
            continue
        log(f"Unique idea accepted: {idea['title']}")
        return idea
    raise RuntimeError("Failed to generate unique idea")
# =====================================================
# STORY GENERATION
# =====================================================
def generate_story_full(idea):
    """Turn an idea dict into a cleaned, at-most-three-paragraph story."""
    prompt_lines = [
        "Write ONLY story text. EXACTLY 3 paragraphs.",
        "",
        f"Character: {idea['character']}",
        f"Setting: {idea['setting']}",
        f"Twist: {idea['twist']}",
        f"Style: {idea['style']}",
        "",
    ]
    prompt = "\n".join(prompt_lines)
    raw_story = llama_chat(
        "You are an imaginative children's storyteller. Never repeat common plots.",
        prompt,
    )
    return clean_generated_story(raw_story, prompt)
# =====================================================
# JOB PUBLISHER
# =====================================================
def publish_job(dataset_repo, folder_name, filename,
                title, author, theme,
                style_descriptor, length, email,
                job_id):
    """Write a job manifest JSON and upload it to jobs/<job_id>.json in the dataset.

    The manifest records the story's direct-download URL plus the metadata a
    downstream worker needs to process the job.
    """
    # Fix: the original embedded a literal bogus "(unknown)" path segment and
    # never used the *filename* parameter, so story_url pointed nowhere.
    raw_url = (
        f"https://huggingface.co/datasets/"
        f"{dataset_repo}/resolve/main/{folder_name}/{filename}"
    )
    job = {
        "job_id": job_id,
        "email": email,
        "story_url": raw_url,
        "repo": dataset_repo,
        "folder": folder_name,
        "title": title,
        "author": author,
        "theme": theme,
        "style_descriptor": style_descriptor,
        "length": length,
        "status": "NEW",
        "created_at": datetime.utcnow().isoformat()
    }
    # Job-scoped temp path: the API thread and the scheduler thread can publish
    # concurrently, and a shared /tmp/job.json could be clobbered mid-upload.
    job_path = f"/tmp/job_{job_id}.json"
    with open(job_path, "w") as f:
        json.dump(job, f, indent=2)
    api.upload_file(
        path_or_fileobj=job_path,
        path_in_repo=f"jobs/{job_id}.json",
        repo_id=dataset_repo,
        repo_type="dataset"
    )
# =====================================================
# STORY PIPELINE
# =====================================================
def create_story_pipeline():
    """Run the full pipeline: idea → story → upload markdown → publish job manifest.

    Returns:
        Summary dict with the job id, title, the raw idea, and the story text.
    """
    log("🧠 Generating idea...")
    idea = generate_idea()
    log("✍️ Writing story...")
    story_text = generate_story_full(idea)
    title = idea["title"]
    job_id = generate_job_id(title)
    safe_title = re.sub(r"\W+", "_", title.strip())
    folder_name = f"{job_id}_{safe_title}"
    filename = f"{safe_title}.md"
    # Fix: the original uploaded to a literal "(unknown)" path segment instead
    # of the computed markdown filename, so the file never matched the manifest.
    api.upload_file(
        path_or_fileobj=BytesIO(story_text.encode()),
        path_in_repo=f"{folder_name}/{filename}",
        repo_id=DATASET_REPO,
        repo_type="dataset"
    )
    publish_job(
        DATASET_REPO,
        folder_name,
        filename,
        title,
        idea.get("author", "OhamLab"),
        idea["character"],
        idea["style"],
        "Short",
        "auto@system.ai",
        job_id
    )
    log(f"✅ Job completed: {job_id}")
    return {
        "job_id": job_id,
        "title": title,
        "idea": idea,
        "story_preview": story_text,
    }
# =====================================================
# API
# =====================================================
@app.get("/create-story")
@app.post("/create-story")
def create_story():
    """Run the story pipeline; on failure return the error as JSON instead of a 500."""
    try:
        result = create_story_pipeline()
    except Exception as exc:
        log(f"Error: {exc}")
        return {"error": str(exc)}
    return result
@app.get("/health")
def health():
    """Liveness probe endpoint; always reports an OK status payload."""
    return dict(status="ok")
# =====================================================
# SCHEDULER
# =====================================================
def scheduler():
    """Background loop: run the story pipeline every 15 minutes, forever.

    Failures are logged and swallowed so one bad run never stops the loop.
    """
    interval_seconds = 900
    while True:
        try:
            log("🚀 Scheduler triggered")
            create_story_pipeline()
        except Exception as exc:
            log(f"Scheduler error: {exc}")
        time.sleep(interval_seconds)
@app.on_event("startup")
def start_scheduler():
    """Launch the background scheduler as a daemon thread when the API starts."""
    worker = threading.Thread(target=scheduler, daemon=True)
    worker.start()
# =====================================================
# LOCAL RUN
# =====================================================
if __name__ == "__main__":
    # Local/dev entry point; HF Spaces conventionally serve on port 7860.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)