Spaces:
Build error
Build error
| import gradio as gr | |
| import fitz # PyMuPDF | |
| from huggingface_hub import InferenceClient, HfApi | |
| import datetime | |
| import os | |
# ── Config ────────────────────────────────────────────────────────────────────
# Identifier of the chat model that call_model() sends requests to.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

# Both settings are read from the environment once, at import time, and fall
# back to an empty string when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN", "")
SAVE_REPO = os.getenv("SAVE_REPO", "")
# ── PDF extraction ────────────────────────────────────────────────────────────
def extract_pdf_text(pdf_file, max_words: int = 6000) -> str:
    """Return the plain text of an uploaded PDF, capped at ``max_words`` words.

    Args:
        pdf_file: Upload object exposing the file path as ``.name``, a plain
            path string, or ``None``.
        max_words: Maximum number of whitespace-separated words to keep.

    Returns:
        The page texts joined by blank lines and stripped. When the text has
        more than ``max_words`` words, only the first ``max_words`` are kept
        (re-joined with single spaces) and a truncation marker is appended.
        An empty string when ``pdf_file`` is ``None``.

    Raises:
        Whatever the PDF library raises when the file cannot be opened or read.
    """
    if pdf_file is None:
        return ""

    # Accept both upload objects (path stored in ``.name``) and bare paths.
    path = getattr(pdf_file, "name", pdf_file)

    # The context manager closes the document even if a page fails to extract.
    with fitz.open(path) as doc:
        pages = [page.get_text() for page in doc]

    text = "\n\n".join(pages).strip()
    words = text.split()
    if len(words) > max_words:
        # Keeps the prompt within a size the model can handle.
        text = " ".join(words[:max_words]) + "\n\n[... article truncated ...]"
    return text
# ── Prompts ───────────────────────────────────────────────────────────────────
def build_first_prompt(article_text: str) -> str:
    """Compose the initial generation prompt for one article.

    The prompt is a fixed list of instructions (dialogue format, length,
    sections to ignore, opening and closing rules), followed by the article
    text and a final ``Begin conversation:`` cue, one item per line.

    Args:
        article_text: Extracted article text to embed in the prompt.

    Returns:
        The complete prompt string.
    """
    instructions = (
        "Generate a conversation between an anchor and an author based on the article below.",
        "The conversation should cover the main points of the article in a question-and-answer format.",
        "Make it as long as possible, but keep it relevant to the article content and to not exceed 40-50 turns.",
        "Each turn will start with the participant's role in square brackets, followed by a colon and their utterance. Make sure that all utterances have the speaker annotation.:",
        "[ANCHOR]: ...",
        "[AUTHOR]: ...",
        "Only one utterance per turn. Use only information from the article.",
        "Do not invent facts not found in the article.",
        "Make the conversation engaging and informative.",
        "Make it sound natural and human-like.",
        "Ignore the Acknowledgment section of the article.",
        "Ignore the links in the article.",
        "Ignore the references in the article.",
        "Try to discuss a little bit about the results presented in the tables of the articles.",
        "Make the first part of the entry utterance using: Good day everyone, welcome to our show. Today we have with us [Author's Name], the author of the article. Thank you for joining us.",
        "Make sure that the closing is natural, not leaving a question in the air.",
        "Make sure that there are only replies from the ANCHOR and AUTHOR, no other speakers or sentences added.",
        "VERY IMPORTANT: THE NR OF TURNS SHOULD BE EQUAL FOR BOTH ANCHOR AND AUTHOR.",
        "DO NOT GENERATE ANYTHING ELSE. ONLY PROVIDE THE CONVERSATION WITH THE ANNOTATIONS.",
        "Here is the",
        "ARTICLE:",
    )
    closing_cue = "Begin conversation:"
    return "\n".join((*instructions, f"{article_text}", closing_cue))
def build_reprompt(article_text: str, user_prompt: str, previous_conv: str) -> str:
    """Compose the follow-up prompt used when the user asks for a new version.

    Sections are separated by blank lines, in this order: the user's own
    instructions, the article, the previous conversation (only when it is
    not blank), and the ``Begin conversation:`` cue.

    Args:
        article_text: Extracted article text.
        user_prompt: Instructions typed by the user; surrounding whitespace
            is removed.
        previous_conv: Conversation currently shown in the UI; may be blank.

    Returns:
        The complete prompt string.
    """
    sections = [user_prompt.strip(), f"ARTICLE:\n{article_text}"]

    earlier = previous_conv.strip()
    if earlier:
        sections.append(f"PREVIOUS CONVERSATION (for reference):\n{earlier}")

    sections.append("Begin conversation:")
    return "\n\n".join(sections)
# ── Model call ────────────────────────────────────────────────────────────────
# System message sent with every chat request; it restricts the model to the
# dialogue-only output format.
SYSTEM = " ".join(
    [
        "You are a professional science-news podcast producer.",
        "You generate ONLY structured dialogues in the exact format requested by the user.",
        "Never add explanations, preamble, or text outside the conversation turns.",
    ]
)
def call_model(prompt: str, temperature: float = 0.7) -> str:
    """Send ``prompt`` to the chat model and return the reply text.

    Args:
        prompt: Content of the user message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The first choice's message content with surrounding whitespace removed.

    Raises:
        RuntimeError: If the response carries no choices or no text, so that
            callers report a clear error instead of showing a blank result.
        Exception: Anything raised by the inference client propagates.
    """
    # An empty token is passed as None rather than as an empty string.
    client = InferenceClient(MODEL_ID, token=HF_TOKEN or None)
    result = client.chat_completion(
        messages=[
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": prompt},
        ],
        max_tokens=3000,
        temperature=temperature,
    )

    choices = result.choices
    content = choices[0].message.content if choices else None
    reply = (content or "").strip()
    if not reply:
        raise RuntimeError("Model returned an empty response.")
    return reply
# ── Handlers ──────────────────────────────────────────────────────────────────
def generate_conversation(pdf_file):
    """Handle the "generate" button: build the first conversation from a PDF.

    Args:
        pdf_file: Uploaded file, or ``None`` when nothing was uploaded.

    Returns:
        A ``(conversation, status_message, article_text)`` tuple. On any
        failure the conversation and article text are empty strings and the
        status message describes the problem.
    """
    # NOTE(review): the leading symbols of the status strings look mis-encoded
    # (mojibake); they are left untouched here — confirm against the real file.
    if pdf_file is None:
        return "", "β οΈ Please upload a PDF first.", ""

    try:
        article_text = extract_pdf_text(pdf_file)
    except Exception:
        # A file the PDF library cannot read is reported exactly like an
        # empty extraction instead of crashing the handler.
        article_text = ""
    if not article_text:
        return "", "β οΈ Could not extract text. Make sure the PDF is not scanned or password-protected.", ""

    try:
        conversation = call_model(build_first_prompt(article_text))
    except Exception as e:
        return "", f"β Model error: {e}", ""

    # A turn is any line that starts with one of the two speaker tags.
    speaker_tags = ("[ANCHOR]", "[AUTHOR]")
    turns = sum(
        line.strip().startswith(speaker_tags) for line in conversation.split("\n")
    )
    return conversation, f"β Generated β {turns} turns Β· {len(conversation.split())} words", article_text
def regenerate_conversation(user_prompt, current_conv, article_cache, pdf_file):
    """Handle the "regenerate" button: rebuild the conversation from user instructions.

    Args:
        user_prompt: Instructions typed by the user; must not be blank.
        current_conv: Conversation currently shown (possibly hand-edited).
        article_cache: Article text kept from an earlier run; when blank the
            text is extracted again from ``pdf_file``.
        pdf_file: Uploaded file, or ``None``.

    Returns:
        A ``(conversation, status_message, article_text)`` tuple. On failure
        the current conversation is returned unchanged together with a
        status message describing the problem.
    """
    # NOTE(review): the leading symbols of the status strings look mis-encoded
    # (mojibake); they are left untouched here — confirm against the real file.
    if not user_prompt.strip():
        return current_conv, "β οΈ Write your prompt/instructions before regenerating.", article_cache

    article_text = article_cache
    if not article_text.strip():
        if pdf_file is None:
            return current_conv, "β οΈ Upload the PDF first.", ""
        try:
            article_text = extract_pdf_text(pdf_file)
        except Exception:
            # A file the PDF library cannot read is reported exactly like an
            # empty extraction instead of crashing the handler.
            article_text = ""
        if not article_text:
            return current_conv, "β οΈ Could not extract PDF text.", ""

    try:
        conversation = call_model(
            build_reprompt(article_text, user_prompt, current_conv),
            temperature=0.75,
        )
    except Exception as e:
        return current_conv, f"β Model error: {e}", article_text

    # A turn is any line that starts with one of the two speaker tags.
    speaker_tags = ("[ANCHOR]", "[AUTHOR]")
    turns = sum(
        line.strip().startswith(speaker_tags) for line in conversation.split("\n")
    )
    return conversation, f"β Regenerated β {turns} turns Β· {len(conversation.split())} words", article_text
def save_to_hub(conversation: str, pdf_file):
    """Upload the conversation as a text file to the configured dataset repo.

    The file is stored as ``conversations/<pdf name>_<timestamp>.txt`` and
    starts with a short header (source, generation time, model).

    Args:
        conversation: Conversation text to save; must not be blank.
        pdf_file: Uploaded file used to derive the file name, or ``None``
            (the name then falls back to ``"article"``).

    Returns:
        A status message: the URL of the saved file on success, otherwise a
        warning or the error text. Upload errors are reported, not raised.
    """
    # NOTE(review): the leading symbols of the status strings and the header
    # rule character look mis-encoded (mojibake); they are left untouched
    # here — confirm against the real file.
    if not conversation.strip():
        return "β οΈ Nothing to save β generate a conversation first."
    if not HF_TOKEN:
        return "β οΈ HF_TOKEN secret not set."
    if not SAVE_REPO:
        return "β οΈ SAVE_REPO secret not set."

    # Read the clock once so the file name and the header always agree.
    now = datetime.datetime.now()
    ts = now.strftime("%Y%m%d_%H%M%S")

    pdf_name = "article"
    if pdf_file is not None:
        base = os.path.basename(getattr(pdf_file, "name", pdf_file))
        # Drop only a trailing ".pdf" extension (any letter case); a ".pdf"
        # in the middle of the name is kept.
        stem, ext = os.path.splitext(base)
        pdf_name = (stem if ext.lower() == ".pdf" else base) or "article"

    path_in_repo = f"conversations/{pdf_name}_{ts}.txt"
    header = (
        "# Research News Podcast Conversation\n"
        f"Source: {pdf_name}\n"
        f"Generated: {now.strftime('%Y-%m-%d %H:%M')}\n"
        f"Model: {MODEL_ID}\n"
        f"{'β'*60}\n\n"
    )
    content = (header + conversation).encode("utf-8")

    try:
        api = HfApi(token=HF_TOKEN)
        # Creates the private dataset repo on first use; a no-op afterwards.
        api.create_repo(repo_id=SAVE_REPO, repo_type="dataset", exist_ok=True, private=True)
        api.upload_file(
            path_or_fileobj=content,
            path_in_repo=path_in_repo,
            repo_id=SAVE_REPO,
            repo_type="dataset",
            commit_message=f"Add: {pdf_name}_{ts}",
        )
        return f"β Saved β https://huggingface.co/datasets/{SAVE_REPO}/blob/main/{path_in_repo}"
    except Exception as e:
        return f"β Save error: {e}"
# ── UI ────────────────────────────────────────────────────────────────────────
# Page definition: an input column (upload, generate, refine) next to a wider
# output column (status, editable conversation, save), followed by the button
# wiring.
# NOTE(review): the nesting below was reconstructed from a listing whose
# indentation was lost — confirm which components belong to which column.
# NOTE(review): several symbols inside the UI strings look mis-encoded
# (mojibake); the strings are left untouched — confirm against the real file.
with gr.Blocks(title="Research News Podcast Generator") as demo:
    # Holds the extracted article text between clicks: written by both
    # generation handlers and read back by regenerate_conversation.
    article_cache = gr.State("")
    gr.Markdown("""
# ποΈ Research News Podcast Generator
Upload your scientific article and generate an engaging **[ANCHOR] / [AUTHOR]** dialogue for your research news show.
Refine with your own prompt, then save directly to HuggingFace.
""")
    with gr.Row():
        # Input column: PDF upload, first generation, and free-form refinement.
        with gr.Column(scale=1):
            gr.Markdown("### 1 Β· Upload Article PDF")
            pdf_input = gr.File(label="PDF file", file_types=[".pdf"])
            gen_btn = gr.Button("β‘ Generate Conversation", variant="primary")
            gr.Markdown("### 2 Β· Refine with your own prompt")
            gr.Markdown(
                "_Not happy with the result? Write your own prompt or just say what to change. "
                "The model will see your prompt + the article + the previous conversation._"
            )
            reprompt = gr.Textbox(
                label="Your prompt / instructions",
                placeholder=(
                    "Examples:\n"
                    "β’ Make it shorter, max 20 turns\n"
                    "β’ Focus more on the methodology\n"
                    "β’ The author name is Dr. Maria Ionescu β fix the intro\n"
                    "β’ Use a more informal tone"
                ),
                lines=6,
            )
            regen_btn = gr.Button("π Regenerate with my prompt")
        # Output column: status line, editable conversation, and save controls.
        with gr.Column(scale=2):
            gr.Markdown("### 3 Β· Generated Conversation")
            status_msg = gr.Textbox(label="Status", interactive=False, lines=1)
            # Editable, so manual corrections are what gets regenerated from
            # and saved.
            conversation_out = gr.Textbox(
                label="Conversation (editable)",
                lines=25,
                max_lines=60,
                interactive=True,
                show_copy_button=True,
                placeholder="The conversation will appear here.\nYou can also edit it manually before saving.",
            )
            gr.Markdown("### 4 Β· Save to HuggingFace")
            gr.Markdown(
                "Requires two secrets in **Space β Settings β Repository secrets**: \n"
                "`HF_TOKEN` β HuggingFace write token \n"
                "`SAVE_REPO` β e.g. `yourname/research-conversations`"
            )
            save_btn = gr.Button("πΎ Save conversation to HuggingFace", variant="secondary")
            save_status = gr.Textbox(label="Save status", interactive=False, lines=1)
    # ── Events ────────────────────────────────────────────────────────────────
    # Each handler returns one value per listed output, in the same order.
    gen_btn.click(
        fn=generate_conversation,
        inputs=[pdf_input],
        outputs=[conversation_out, status_msg, article_cache],
    )
    regen_btn.click(
        fn=regenerate_conversation,
        inputs=[reprompt, conversation_out, article_cache, pdf_input],
        outputs=[conversation_out, status_msg, article_cache],
    )
    save_btn.click(
        fn=save_to_hub,
        inputs=[conversation_out, pdf_input],
        outputs=[save_status],
    )
# Starts the app as soon as the module is executed.
demo.launch()