import datetime
import os

import fitz  # PyMuPDF
import gradio as gr
from huggingface_hub import HfApi, InferenceClient

# ── Config ─────────────────────────────────────────────────────────────────────
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # Space → Settings → Secrets
SAVE_REPO = os.environ.get("SAVE_REPO", "")  # e.g. "username/research-conversations"

# Speaker tags that open a turn; used when counting turns in model output.
_SPEAKER_TAGS = ("[ANCHOR]", "[AUTHOR]")


# ── PDF extraction ─────────────────────────────────────────────────────────────
def _pdf_path(pdf_file) -> str:
    """Return the filesystem path of an upload given as a path or an object with ``.name``."""
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def extract_pdf_text(pdf_file, max_words: int = 6000) -> str:
    """Return the plain text of a PDF, truncated to at most ``max_words`` words.

    Args:
        pdf_file: Uploaded file (a path, or an object whose ``.name`` is the
            path), or ``None``.
        max_words: Word budget; longer texts are cut and a truncation marker
            is appended.

    Returns:
        The page texts joined by blank lines, or ``""`` when ``pdf_file`` is
        ``None`` or no text could be extracted.

    Raises:
        Whatever ``fitz`` raises for files it cannot open or read.
    """
    if pdf_file is None:
        return ""
    doc = fitz.open(_pdf_path(pdf_file))
    try:
        pages = [page.get_text() for page in doc]
    finally:
        # Close the document even when a page fails to extract.
        doc.close()
    text = "\n\n".join(pages).strip()
    words = text.split()
    if len(words) > max_words:
        text = " ".join(words[:max_words]) + "\n\n[... article truncated ...]"
    return text


# ── Fixed first-generation prompt ─────────────────────────────────────────────
def build_first_prompt(article_text: str) -> str:
    """Return the fixed first-generation prompt with ``article_text`` embedded."""
    return (
        "Generate a conversation between an anchor and an author based on the article below. "
        "The conversation should cover the main points of the article in a question-and-answer format. "
        "Make it as long as possible, but keep it relevant to the article content and to not exceed 40-50 turns. "
        "Each turn will start with the participant's role in square brackets, followed by a colon and their utterance. "
        "Make sure that all utterances have the speaker annotation.: "
        "[ANCHOR]: ... "
        "[AUTHOR]: ... "
        "Only one utterance per turn. "
        "Use only information from the article. "
        "Do not invent facts not found in the article. "
        "Make the conversation engaging and informative. "
        "Make it sound natural and human-like. "
        "Ignore the Acknowledgment section of the article. "
        "Ignore the links in the article. "
        "Ignore the references in the article. "
        "Try to discuss a little bit about the results presented in the tables of the articles. "
        "Make the first part of the entry utterance using: "
        "Good day everyone, welcome to our show. "
        "Today we have with us [Author's Name], the author of the article. \n"
        "Thank you for joining us. "
        "Make sure that the closing is natural, not leaving a question in the air. "
        "Make sure that there are only replies from the ANCHOR and AUTHOR, no other speakers or sentences added. "
        "VERY IMPORTANT: THE NR OF TURNS SHOULD BE EQUAL FOR BOTH ANCHOR AND AUTHOR. "
        "DO NOT GENERATE ANYTHING ELSE. "
        "ONLY PROVIDE THE CONVERSATION WITH THE ANNOTATIONS. "
        f"Here is the ARTICLE: {article_text} Begin conversation:"
    )


# ── Re-generation: user writes their own full prompt ──────────────────────────
def build_reprompt(article_text: str, user_prompt: str, previous_conv: str) -> str:
    """Assemble a regeneration prompt from the user's instructions.

    The article is always appended; the previous conversation is appended
    only when it is non-blank. ``None`` for ``user_prompt`` or
    ``previous_conv`` is treated as an empty string.
    """
    previous = (previous_conv or "").strip()
    msg = (user_prompt or "").strip()
    msg += f"\n\nARTICLE:\n{article_text}"
    if previous:
        msg += f"\n\nPREVIOUS CONVERSATION (for reference):\n{previous}"
    msg += "\n\nBegin conversation:"
    return msg


# ── Call model ─────────────────────────────────────────────────────────────────
SYSTEM = (
    "You are a professional science-news podcast producer. "
    "You generate ONLY structured dialogues in the exact format requested by the user. "
    "Never add explanations, preamble, or text outside the conversation turns."
)


def call_model(prompt: str, temperature: float = 0.7) -> str:
    """Send ``prompt`` to ``MODEL_ID`` via chat completion and return the reply text.

    Raises:
        RuntimeError: If the model returns no text. Callers report this as a
            model error instead of showing an empty "successful" result.
    """
    client = InferenceClient(MODEL_ID, token=HF_TOKEN or None)
    result = client.chat_completion(
        messages=[
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": prompt},
        ],
        max_tokens=3000,
        temperature=temperature,
    )
    content = result.choices[0].message.content
    if not content or not content.strip():
        raise RuntimeError("model returned an empty response")
    return content.strip()


def _count_turns(conversation: str) -> int:
    """Count the lines that start with an ``[ANCHOR]`` or ``[AUTHOR]`` tag."""
    return sum(
        1
        for line in conversation.split("\n")
        if line.strip().startswith(_SPEAKER_TAGS)
    )


# ── Generate (first time) ──────────────────────────────────────────────────────
def generate_conversation(pdf_file, progress=gr.Progress()):
    """Generate a conversation from an uploaded PDF using the fixed prompt.

    Returns:
        ``(conversation, status_message, article_text)``. On any failure the
        conversation and article text are empty strings and the status
        message explains the problem.
    """
    if pdf_file is None:
        return "", "⚠️ Please upload a PDF first.", ""

    progress(0.1, desc="Extracting text from PDF…")
    try:
        article_text = extract_pdf_text(pdf_file)
    except Exception as e:
        # Corrupt or encrypted files raise inside fitz; report them the same
        # way model failures are reported instead of crashing the handler.
        return "", f"❌ PDF error: {e}", ""
    if not article_text:
        return "", "⚠️ Could not extract text. Make sure the PDF is not scanned or password-protected.", ""

    progress(0.35, desc="Sending to model…")
    try:
        conversation = call_model(build_first_prompt(article_text))
    except Exception as e:
        return "", f"❌ Model error: {e}", ""

    turns = _count_turns(conversation)
    info = f"✅ Generated — {turns} turns · {len(conversation.split())} words"
    progress(1.0)
    return conversation, info, article_text


# ── Re-generate with user's custom prompt ─────────────────────────────────────
def regenerate_conversation(user_prompt, current_conv, article_cache, pdf_file, progress=gr.Progress()):
    """Regenerate the conversation using the user's own prompt.

    The cached article text is reused when present; otherwise the PDF is
    extracted again.

    Returns:
        ``(conversation, status_message, article_text)``. On failure the
        current conversation is returned unchanged.
    """
    # UI components may hand over None instead of an empty string.
    user_prompt = user_prompt or ""
    current_conv = current_conv or ""
    article_cache = article_cache or ""

    if not user_prompt.strip():
        return current_conv, "⚠️ Write your prompt/instructions before regenerating.", article_cache

    article_text = article_cache
    if not article_text.strip():
        if pdf_file is None:
            return current_conv, "⚠️ Upload the PDF first.", ""
        progress(0.1, desc="Extracting PDF…")
        try:
            article_text = extract_pdf_text(pdf_file)
        except Exception as e:
            return current_conv, f"❌ PDF error: {e}", ""
        if not article_text:
            return current_conv, "⚠️ Could not extract PDF text.", ""

    progress(0.3, desc="Sending new prompt to model…")
    try:
        conversation = call_model(
            build_reprompt(article_text, user_prompt, current_conv),
            temperature=0.75,
        )
    except Exception as e:
        return current_conv, f"❌ Model error: {e}", article_text

    turns = _count_turns(conversation)
    info = f"✅ Regenerated — {turns} turns · {len(conversation.split())} words"
    progress(1.0)
    return conversation, info, article_text


# ── Save to HuggingFace Hub ────────────────────────────────────────────────────
def save_to_hub(conversation: str, pdf_file):
    """Upload the conversation as a text file to the ``SAVE_REPO`` dataset repo.

    Returns a status string: a warning when there is nothing to save or a
    required secret is missing, otherwise the result of the upload.
    """
    if not conversation.strip():
        return "⚠️ Nothing to save — generate a conversation first."
    if not HF_TOKEN:
        return "⚠️ HF_TOKEN secret not set. Add it in Space → Settings → Repository secrets."
    if not SAVE_REPO:
        return "⚠️ SAVE_REPO secret not set. Add it as 'username/your-dataset-repo'."
ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") pdf_name = "article" if pdf_file is not None: pdf_name = os.path.basename(pdf_file.name).replace(".pdf", "") path_in_repo = f"conversations/{pdf_name}_{ts}.txt" header = ( f"# Research News Podcast Conversation\n" f"Source article: {pdf_name}\n" f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}\n" f"Model: {MODEL_ID}\n" f"{'─'*60}\n\n" ) content = (header + conversation).encode("utf-8") try: api = HfApi(token=HF_TOKEN) api.create_repo(repo_id=SAVE_REPO, repo_type="dataset", exist_ok=True, private=True) api.upload_file( path_or_fileobj=content, path_in_repo=path_in_repo, repo_id=SAVE_REPO, repo_type="dataset", commit_message=f"Add conversation: {pdf_name}_{ts}", ) url = f"https://huggingface.co/datasets/{SAVE_REPO}/blob/main/{path_in_repo}" return f"✅ Saved → {url}" except Exception as e: return f"❌ Save error: {e}" # ── CSS ──────────────────────────────────────────────────────────────────────── CUSTOM_CSS = """ @import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@400;700;900&family=DM+Sans:wght@300;400;500&display=swap'); :root { --ink: #0e0c0a; --paper: #f5f0e8; --accent: #c8392b; --accent2: #1d4ed8; --muted: #7a7060; --border: #d4cfc4; --card: #fffdf8; } body, .gradio-container { background: var(--paper) !important; font-family: 'DM Sans', sans-serif !important; } .gradio-container { max-width: 1040px !important; margin: 0 auto !important; } .app-header { text-align:center; padding:36px 20px 22px; border-bottom:2.5px solid var(--ink); margin-bottom:28px; } .app-header h1 { font-family:'Playfair Display',Georgia,serif; font-size:2.5rem; font-weight:900; color:var(--ink); letter-spacing:-.02em; margin:0 0 4px; } .app-header .kicker { font-size:.75rem; letter-spacing:.18em; text-transform:uppercase; color:var(--accent); font-weight:500; margin-bottom:8px; } .app-header .subtitle { color:var(--muted); font-size:.95rem; max-width:560px; margin:0 auto; line-height:1.65; } 
.panel-label { font-family:'Playfair Display',serif; font-size:1rem; font-weight:700; color:var(--ink); border-bottom:1px solid var(--border); padding-bottom:7px; margin-bottom:10px; display:flex; align-items:center; gap:8px; } .step-badge { background:var(--ink); color:var(--paper); border-radius:50%; width:21px; height:21px; display:inline-flex; align-items:center; justify-content:center; font-size:.68rem; font-family:'DM Sans',sans-serif; font-weight:600; flex-shrink:0; } .upload-zone { border:2px dashed var(--border) !important; border-radius:6px !important; background:var(--card) !important; transition:border-color .2s; } .upload-zone:hover { border-color:var(--accent) !important; } button.btn-primary { background:var(--ink) !important; color:var(--paper) !important; border:none !important; border-radius:4px !important; font-family:'DM Sans',sans-serif !important; font-weight:500 !important; letter-spacing:.04em !important; transition:background .2s !important; } button.btn-primary:hover { background:var(--accent) !important; } button.btn-secondary { background:transparent !important; color:var(--ink) !important; border:1.5px solid var(--ink) !important; border-radius:4px !important; font-family:'DM Sans',sans-serif !important; font-weight:500 !important; transition:all .2s !important; } button.btn-secondary:hover { background:var(--ink) !important; color:var(--paper) !important; } button.btn-save { background:var(--accent2) !important; color:white !important; border:none !important; border-radius:4px !important; font-family:'DM Sans',sans-serif !important; font-weight:500 !important; transition:background .2s !important; } button.btn-save:hover { background:#1e3a8a !important; } .conversation-box textarea { font-family:'DM Mono','Courier New',monospace !important; font-size:.9rem !important; line-height:1.8 !important; background:var(--card) !important; border:1px solid var(--border) !important; border-radius:6px !important; color:var(--ink) !important; 
padding:14px !important; } .reprompt-box textarea { font-size:.92rem !important; line-height:1.6 !important; border:1.5px solid var(--accent2) !important; border-radius:6px !important; background:#f0f4ff !important; } .status-bar { font-size:.83rem; color:var(--muted); padding:4px 0; font-style:italic; } .section-div { border:none; border-top:1px solid var(--border); margin:20px 0; } .info-box { background:var(--card); border:1px solid var(--border); border-radius:6px; padding:12px 16px; font-size:.85rem; color:var(--muted); line-height:1.6; } """ # ── UI ───────────────────────────────────────────────────────────────────────── with gr.Blocks(css=CUSTOM_CSS, title="Research News Podcast Generator") as demo: article_cache = gr.State("") gr.HTML("""
Upload your scientific article and generate an engaging anchor–author dialogue for your research news show. Refine with your own prompt, then save to HuggingFace.
.txt file inside a private HuggingFace dataset repo.HF_TOKEN — your HuggingFace write tokenSAVE_REPO — e.g. yourname/research-conversations