Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import fitz # PyMuPDF | |
| from huggingface_hub import InferenceClient, HfApi | |
| import datetime | |
| import os | |
# ── Config ────────────────────────────────────────────────────────────────────
# Identifier of the chat model requested from the inference client.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
# Access token read from the environment; an empty string means "not set".
# Configure it under Space -> Settings -> Secrets.
HF_TOKEN = os.getenv("HF_TOKEN", "")
# Target repo for saved conversations, e.g. "username/research-conversations".
SAVE_REPO = os.getenv("SAVE_REPO", "")
# ── PDF extraction ────────────────────────────────────────────────────────────
def extract_pdf_text(pdf_file, max_words: int = 6000) -> str:
    """Return the text of an uploaded PDF, truncated to ``max_words`` words.

    Args:
        pdf_file: The uploaded file: either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.
        max_words: Maximum number of whitespace-separated words to keep.

    Returns:
        The page texts joined by blank lines and stripped. When the text has
        more than ``max_words`` words, only the first ``max_words`` are kept
        (re-joined with single spaces) and a truncation marker is appended.
        Returns ``""`` when ``pdf_file`` is ``None``.
    """
    if pdf_file is None:
        return ""

    # Accept both a plain path and a file-like wrapper carrying ``.name``.
    path = pdf_file if isinstance(pdf_file, (str, os.PathLike)) else pdf_file.name

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(path) as doc:
        pages = [page.get_text() for page in doc]

    text = "\n\n".join(pages).strip()
    words = text.split()
    if len(words) > max_words:
        text = " ".join(words[:max_words]) + "\n\n[... article truncated ...]"
    return text
# ── Fixed first-generation prompt ─────────────────────────────────────────────
def build_first_prompt(article_text: str) -> str:
    """Build the fixed prompt used for the first generation.

    The prompt is a list of instructions (one per line), followed by the
    article text and a final "Begin conversation:" cue.

    Args:
        article_text: Text of the article to embed in the prompt.

    Returns:
        The complete prompt string.
    """
    instructions = (
        "Generate a conversation between an anchor and an author based on the article below.",
        "The conversation should cover the main points of the article in a question-and-answer format.",
        "Make it as long as possible, but keep it relevant to the article content and to not exceed 40-50 turns.",
        "Each turn will start with the participant's role in square brackets, followed by a colon and their utterance. Make sure that all utterances have the speaker annotation.:",
        "[ANCHOR]: ...",
        "[AUTHOR]: ...",
        "Only one utterance per turn. Use only information from the article.",
        "Do not invent facts not found in the article.",
        "Make the conversation engaging and informative.",
        "Make it sound natural and human-like.",
        "Ignore the Acknowledgment section of the article.",
        "Ignore the links in the article.",
        "Ignore the references in the article.",
        "Try to discuss a little bit about the results presented in the tables of the articles.",
        "Make the first part of the entry utterance using: Good day everyone, welcome to our show. Today we have with us [Author's Name], the author of the article. Thank you for joining us.",
        "Make sure that the closing is natural, not leaving a question in the air.",
        "Make sure that there are only replies from the ANCHOR and AUTHOR, no other speakers or sentences added.",
        "VERY IMPORTANT: THE NR OF TURNS SHOULD BE EQUAL FOR BOTH ANCHOR AND AUTHOR.",
        "DO NOT GENERATE ANYTHING ELSE. ONLY PROVIDE THE CONVERSATION WITH THE ANNOTATIONS.",
        "Here is the",
        "ARTICLE:",
    )
    return "\n".join(instructions) + f"\n{article_text}\nBegin conversation:"
# ── Re-generation: user writes their own full prompt ──────────────────────────
def build_reprompt(article_text: str, user_prompt: str, previous_conv: str) -> str:
    """Assemble the regeneration prompt from the user's own instructions.

    The sections are separated by blank lines, in this order: the stripped
    user prompt, the article, the previous conversation (only when it is not
    blank), and the "Begin conversation:" cue.

    Args:
        article_text: Text of the article.
        user_prompt: Instructions written by the user.
        previous_conv: The conversation currently shown, given to the model
            for reference.

    Returns:
        The complete prompt string.
    """
    sections = [user_prompt.strip(), f"ARTICLE:\n{article_text}"]
    earlier = previous_conv.strip()
    if earlier:
        sections.append(f"PREVIOUS CONVERSATION (for reference):\n{earlier}")
    sections.append("Begin conversation:")
    return "\n\n".join(sections)
# ── Call model ────────────────────────────────────────────────────────────────
# System message sent with every chat completion request.
SYSTEM = " ".join(
    (
        "You are a professional science-news podcast producer.",
        "You generate ONLY structured dialogues in the exact format requested by the user.",
        "Never add explanations, preamble, or text outside the conversation turns.",
    )
)
def call_model(prompt: str, temperature: float = 0.7) -> str:
    """Send ``prompt`` to the chat model and return its stripped reply.

    Args:
        prompt: The user message. ``SYSTEM`` is sent as the system message.
        temperature: Sampling temperature forwarded to the request.

    Returns:
        The text of the first choice, with surrounding whitespace removed.

    Raises:
        RuntimeError: If the response has no choices or the first choice
            carries no text content.
        Exception: Anything raised by the inference client propagates.
    """
    # An empty token is passed as None rather than as an empty string.
    client = InferenceClient(MODEL_ID, token=HF_TOKEN or None)
    result = client.chat_completion(
        messages=[
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": prompt},
        ],
        max_tokens=3000,
        temperature=temperature,
    )
    # Fail with a readable message instead of an AttributeError/IndexError
    # when the response holds no text.
    content = result.choices[0].message.content if result.choices else None
    if content is None:
        raise RuntimeError("Model returned no message content.")
    return content.strip()
# ── Generate (first time) ─────────────────────────────────────────────────────
def generate_conversation(pdf_file, progress=gr.Progress()):
    """Extract the uploaded PDF and generate the first conversation.

    Args:
        pdf_file: The uploaded PDF, or ``None`` when nothing was uploaded.
        progress: Gradio progress tracker used to report each stage.

    Returns:
        A ``(conversation, status_message, article_text)`` tuple. On any
        failure the conversation and the article text are empty strings and
        the status message describes the problem.
    """
    if pdf_file is None:
        return "", "β οΈ Please upload a PDF first.", ""

    progress(0.1, desc="Extracting text from PDFβ¦")
    article_text = extract_pdf_text(pdf_file)
    if not article_text:
        return "", "β οΈ Could not extract text. Make sure the PDF is not scanned or password-protected.", ""

    progress(0.35, desc="Sending to modelβ¦")
    try:
        conversation = call_model(build_first_prompt(article_text))
    except Exception as e:
        return "", f"β Model error: {e}", ""

    # A turn is any line that, once stripped, begins with a speaker tag.
    speaker_tags = ("[ANCHOR]", "[AUTHOR]")
    turns = sum(
        1 for line in conversation.split("\n") if line.strip().startswith(speaker_tags)
    )
    word_count = len(conversation.split())
    info = f"β Generated β {turns} turns Β· {word_count} words"
    progress(1.0)
    return conversation, info, article_text
# ── Re-generate with user's custom prompt ─────────────────────────────────────
def regenerate_conversation(user_prompt, current_conv, article_cache, pdf_file, progress=gr.Progress()):
    """Regenerate the conversation from the user's own prompt.

    The cached article text is reused when it is not blank; otherwise the
    text is extracted from ``pdf_file``.

    Args:
        user_prompt: Instructions typed by the user.
        current_conv: Conversation currently shown; passed to the model for
            reference and returned unchanged on failure.
        article_cache: Article text stored by an earlier run, or blank.
        pdf_file: The uploaded PDF, or ``None``.
        progress: Gradio progress tracker used to report each stage.

    Returns:
        A ``(conversation, status_message, article_text)`` tuple.
    """
    if user_prompt.strip() == "":
        return current_conv, "β οΈ Write your prompt/instructions before regenerating.", article_cache

    article_text = article_cache
    cache_is_empty = not article_text.strip()
    if cache_is_empty and pdf_file is None:
        return current_conv, "β οΈ Upload the PDF first.", ""
    if cache_is_empty:
        progress(0.1, desc="Extracting PDFβ¦")
        article_text = extract_pdf_text(pdf_file)
        if not article_text:
            return current_conv, "β οΈ Could not extract PDF text.", ""

    progress(0.3, desc="Sending new prompt to modelβ¦")
    try:
        conversation = call_model(
            build_reprompt(article_text, user_prompt, current_conv),
            temperature=0.75,
        )
    except Exception as e:
        # Keep the shown conversation and the extracted text on model failure.
        return current_conv, f"β Model error: {e}", article_text

    # A turn is any line that, once stripped, begins with a speaker tag.
    speaker_tags = ("[ANCHOR]", "[AUTHOR]")
    turns = sum(
        1 for line in conversation.split("\n") if line.strip().startswith(speaker_tags)
    )
    word_count = len(conversation.split())
    info = f"β Regenerated β {turns} turns Β· {word_count} words"
    progress(1.0)
    return conversation, info, article_text
# ── Save to HuggingFace Hub ───────────────────────────────────────────────────
def save_to_hub(conversation: str, pdf_file):
    """Upload the conversation as a text file to the configured dataset repo.

    The file is written to ``conversations/<pdf name>_<timestamp>.txt`` in
    ``SAVE_REPO``, which is created as a private dataset repo if missing.

    Args:
        conversation: Conversation text to save.
        pdf_file: The uploaded PDF (a path, or an object exposing the path as
            ``.name``), used only to name the saved file. ``None`` falls back
            to the name ``"article"``.

    Returns:
        A status message: a warning when there is nothing to save or a
        required setting is missing, the file URL on success, or the error
        text when the upload fails. This function does not raise on upload
        errors.
    """
    if not conversation.strip():
        return "β οΈ Nothing to save β generate a conversation first."
    if not HF_TOKEN:
        return "β οΈ HF_TOKEN secret not set. Add it in Space β Settings β Repository secrets."
    if not SAVE_REPO:
        return "β οΈ SAVE_REPO secret not set. Add it as 'username/your-dataset-repo'."

    # Take the time once so the file name and the header cannot disagree.
    now = datetime.datetime.now()
    ts = now.strftime("%Y%m%d_%H%M%S")

    pdf_name = "article"
    if pdf_file is not None:
        source = pdf_file if isinstance(pdf_file, (str, os.PathLike)) else pdf_file.name
        base = os.path.basename(os.fspath(source))
        # Drop only a trailing ".pdf" (any case), not every occurrence.
        if base.lower().endswith(".pdf"):
            base = base[: -len(".pdf")]
        pdf_name = base or "article"

    path_in_repo = f"conversations/{pdf_name}_{ts}.txt"
    header = (
        f"# Research News Podcast Conversation\n"
        f"Source article: {pdf_name}\n"
        f"Generated: {now.strftime('%Y-%m-%d %H:%M')}\n"
        f"Model: {MODEL_ID}\n"
        f"{'β'*60}\n\n"
    )
    content = (header + conversation).encode("utf-8")
    try:
        api = HfApi(token=HF_TOKEN)
        api.create_repo(repo_id=SAVE_REPO, repo_type="dataset", exist_ok=True, private=True)
        api.upload_file(
            path_or_fileobj=content,
            path_in_repo=path_in_repo,
            repo_id=SAVE_REPO,
            repo_type="dataset",
            commit_message=f"Add conversation: {pdf_name}_{ts}",
        )
        url = f"https://huggingface.co/datasets/{SAVE_REPO}/blob/main/{path_in_repo}"
        return f"β Saved β {url}"
    except Exception as e:
        # Report the failure in the UI instead of raising.
        return f"β Save error: {e}"
# ── CSS ───────────────────────────────────────────────────────────────────────
# Web fonts, colour variables and page-level layout.
_CSS_THEME = """
@import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@400;700;900&family=DM+Sans:wght@300;400;500&display=swap');
:root {
--ink: #0e0c0a;
--paper: #f5f0e8;
--accent: #c8392b;
--accent2: #1d4ed8;
--muted: #7a7060;
--border: #d4cfc4;
--card: #fffdf8;
}
body, .gradio-container {
background: var(--paper) !important;
font-family: 'DM Sans', sans-serif !important;
}
.gradio-container { max-width: 1040px !important; margin: 0 auto !important; }
"""

# Rules for the individual widgets, addressed through their elem_classes.
_CSS_COMPONENTS = """\
.app-header { text-align:center; padding:36px 20px 22px; border-bottom:2.5px solid var(--ink); margin-bottom:28px; }
.app-header h1 { font-family:'Playfair Display',Georgia,serif; font-size:2.5rem; font-weight:900; color:var(--ink); letter-spacing:-.02em; margin:0 0 4px; }
.app-header .kicker { font-size:.75rem; letter-spacing:.18em; text-transform:uppercase; color:var(--accent); font-weight:500; margin-bottom:8px; }
.app-header .subtitle { color:var(--muted); font-size:.95rem; max-width:560px; margin:0 auto; line-height:1.65; }
.panel-label { font-family:'Playfair Display',serif; font-size:1rem; font-weight:700; color:var(--ink); border-bottom:1px solid var(--border); padding-bottom:7px; margin-bottom:10px; display:flex; align-items:center; gap:8px; }
.step-badge { background:var(--ink); color:var(--paper); border-radius:50%; width:21px; height:21px; display:inline-flex; align-items:center; justify-content:center; font-size:.68rem; font-family:'DM Sans',sans-serif; font-weight:600; flex-shrink:0; }
.upload-zone { border:2px dashed var(--border) !important; border-radius:6px !important; background:var(--card) !important; transition:border-color .2s; }
.upload-zone:hover { border-color:var(--accent) !important; }
button.btn-primary { background:var(--ink) !important; color:var(--paper) !important; border:none !important; border-radius:4px !important; font-family:'DM Sans',sans-serif !important; font-weight:500 !important; letter-spacing:.04em !important; transition:background .2s !important; }
button.btn-primary:hover { background:var(--accent) !important; }
button.btn-secondary { background:transparent !important; color:var(--ink) !important; border:1.5px solid var(--ink) !important; border-radius:4px !important; font-family:'DM Sans',sans-serif !important; font-weight:500 !important; transition:all .2s !important; }
button.btn-secondary:hover { background:var(--ink) !important; color:var(--paper) !important; }
button.btn-save { background:var(--accent2) !important; color:white !important; border:none !important; border-radius:4px !important; font-family:'DM Sans',sans-serif !important; font-weight:500 !important; transition:background .2s !important; }
button.btn-save:hover { background:#1e3a8a !important; }
.conversation-box textarea { font-family:'DM Mono','Courier New',monospace !important; font-size:.9rem !important; line-height:1.8 !important; background:var(--card) !important; border:1px solid var(--border) !important; border-radius:6px !important; color:var(--ink) !important; padding:14px !important; }
.reprompt-box textarea { font-size:.92rem !important; line-height:1.6 !important; border:1.5px solid var(--accent2) !important; border-radius:6px !important; background:#f0f4ff !important; }
.status-bar { font-size:.83rem; color:var(--muted); padding:4px 0; font-style:italic; }
.section-div { border:none; border-top:1px solid var(--border); margin:20px 0; }
.info-box { background:var(--card); border:1px solid var(--border); border-radius:6px; padding:12px 16px; font-size:.85rem; color:var(--muted); line-height:1.6; }
"""

# Complete stylesheet handed to gr.Blocks.
CUSTOM_CSS = _CSS_THEME + _CSS_COMPONENTS
# ── UI ────────────────────────────────────────────────────────────────────────
# Page layout: the left column holds the upload and the refinement prompt, the
# right column holds the status line, the editable conversation and the save
# controls. Event handlers are wired at the end of the block.
with gr.Blocks(css=CUSTOM_CSS, title="Research News Podcast Generator") as demo:
    # Article text returned by the handlers, kept so regeneration can reuse it.
    article_cache = gr.State("")
    gr.HTML("""
<div class="app-header">
<div class="kicker">π Tool for Research Authors</div>
<h1>Research News<br>Podcast Generator</h1>
<p class="subtitle">
Upload your scientific article and generate an engaging anchorβauthor dialogue
for your research news show. Refine with your own prompt, then save to HuggingFace.
</p>
</div>
""")
    with gr.Row(equal_height=False):
        # ── Left: upload + reprompt ──────────────────────────────────────────
        with gr.Column(scale=1, min_width=300):
            gr.HTML('<div class="panel-label"><span class="step-badge">1</span>Upload Article PDF</div>')
            pdf_input = gr.File(label="", file_types=[".pdf"], elem_classes=["upload-zone"])
            gen_btn = gr.Button("β‘ Generate Conversation", elem_classes=["btn-primary"], variant="primary")
            gr.HTML('<hr class="section-div"><div class="panel-label"><span class="step-badge">2</span>Refine β write your own prompt</div>')
            gr.HTML("""
<div class="info-box" style="margin-bottom:10px;">
Not happy with the result? Write your own prompt below β you can give
full instructions or just say what to change. The model will see both
your prompt and the previous conversation.<br><br>
<em>Examples:</em><br>
β’ Make it shorter, max 20 turns<br>
β’ Focus more on the methodology section<br>
β’ Use a more informal tone<br>
β’ The author name is Dr. Maria Ionescu β fix the intro
</div>
""")
            reprompt = gr.Textbox(
                label="",
                placeholder="Write your new prompt or instructions hereβ¦",
                lines=7,
                elem_classes=["reprompt-box"],
            )
            regen_btn = gr.Button("π Regenerate with my prompt", elem_classes=["btn-secondary"])
            gr.HTML("""
<div class="info-box" style="margin-top:16px;">
<strong>Format:</strong> [ANCHOR]: β¦ / [AUTHOR]: β¦<br>
<strong>Model:</strong> Mistral-7B-Instruct (HF Inference API)<br>
<strong>PDF limit:</strong> first ~6 000 words processed
</div>
""")
        # ── Right: output + save ─────────────────────────────────────────────
        with gr.Column(scale=2):
            gr.HTML('<div class="panel-label"><span class="step-badge">3</span>Generated Conversation</div>')
            status_msg = gr.Textbox(
                label="", interactive=False, lines=1, max_lines=1,
                placeholder="Status will appear hereβ¦", elem_classes=["status-bar"],
            )
            conversation_out = gr.Textbox(
                label="", lines=26, max_lines=60, interactive=True,
                show_copy_button=True,
                placeholder=(
                    "The conversation will appear here after generation.\n"
                    "You can also edit it manually before saving."
                ),
                elem_classes=["conversation-box"],
            )
            gr.HTML('<hr class="section-div"><div class="panel-label"><span class="step-badge">4</span>Save to HuggingFace</div>')
            gr.HTML("""
<div class="info-box" style="margin-bottom:10px;">
Saved as a <code>.txt</code> file inside a private HuggingFace dataset repo.<br>
Requires two secrets in <em>Space β Settings β Repository secrets</em>:<br>
β’ <code>HF_TOKEN</code> β your HuggingFace write token<br>
β’ <code>SAVE_REPO</code> β e.g. <code>yourname/research-conversations</code>
</div>
""")
            save_btn = gr.Button("πΎ Save conversation to HuggingFace", elem_classes=["btn-save"])
            save_status = gr.HTML("")

    # ── Events ───────────────────────────────────────────────────────────────
    gen_btn.click(
        fn=generate_conversation,
        inputs=[pdf_input],
        outputs=[conversation_out, status_msg, article_cache],
        show_progress=True,
    )
    regen_btn.click(
        fn=regenerate_conversation,
        inputs=[reprompt, conversation_out, article_cache, pdf_input],
        outputs=[conversation_out, status_msg, article_cache],
        show_progress=True,
    )

    def render_save(conv, pdf):
        """Save the conversation and return the result as a coloured HTML box.

        Args:
            conv: Conversation text taken from the output textbox.
            pdf: The uploaded PDF, forwarded to ``save_to_hub``.

        Returns:
            An HTML ``<div>`` holding the status message, styled blue on
            success and red otherwise.
        """
        import html  # local import keeps this block self-contained

        msg = save_to_hub(conv, pdf)
        # Only the success message has "Saved" as its second word. Keying on
        # that word avoids relying on the leading status glyph, which the
        # failure messages may share.
        ok = msg.split()[1:2] == ["Saved"]
        bg = "#eef4ff" if ok else "#fff0f0"
        br = "#bfd2f8" if ok else "#f8bfbf"
        tc = "#1e3a8a" if ok else "#991b1b"
        # The message embeds the uploaded file name and raw exception text,
        # so escape it before placing it in markup.
        safe_msg = html.escape(msg)
        return f'<div style="font-size:.88rem;padding:10px 14px;border-radius:5px;background:{bg};border:1px solid {br};color:{tc};word-break:break-all;margin-top:4px;">{safe_msg}</div>'

    save_btn.click(
        fn=render_save,
        inputs=[conversation_out, pdf_input],
        outputs=[save_status],
    )

if __name__ == "__main__":
    demo.launch()