Spaces:

catalinmoisan
/

ground_truth_generator2

Build error

App Files Files Community

ground_truth_generator2 / app.py

catalinmoisan

Upload 3 files

fdb90d2 verified about 1 month ago

raw

history blame contribute delete

10.7 kB

	import gradio as gr
	import fitz # PyMuPDF
	from huggingface_hub import InferenceClient, HfApi
	import datetime
	import os

	# ── Config ─────────────────────────────────────────────────────────────────────
	# Chat model identifier handed to the inference client for every request.
	MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
	# Token and destination repo are read from the environment; each falls back to "".
	HF_TOKEN = os.getenv("HF_TOKEN", "")
	SAVE_REPO = os.getenv("SAVE_REPO", "")

	# ── PDF extraction ─────────────────────────────────────────────────────────────
	def extract_pdf_text(pdf_file, max_words: int = 6000) -> str:
	    """Return the plain text of a PDF, capped at ``max_words`` words.

	    Args:
	        pdf_file: Either an object exposing the file path as ``.name`` or a
	            plain path string. ``None`` yields an empty string.
	        max_words: Maximum number of whitespace-separated words to keep.
	            Longer texts are cut and a truncation marker is appended.

	    Returns:
	        The page texts joined by blank lines and stripped. When the word cap
	        is exceeded, the kept words are re-joined with single spaces.

	    Raises:
	        Whatever ``fitz`` raises while opening or reading the file.
	    """
	    if pdf_file is None:
	        return ""
	    # Accept both upload objects (path in ``.name``) and bare path strings.
	    pdf_path = getattr(pdf_file, "name", pdf_file)
	    doc = fitz.open(pdf_path)
	    try:
	        pages = [page.get_text() for page in doc]
	    finally:
	        # Close the document even when a page fails to extract.
	        doc.close()
	    text = "\n\n".join(pages).strip()
	    words = text.split()
	    if len(words) > max_words:
	        text = " ".join(words[:max_words]) + "\n\n[... article truncated ...]"
	    return text

	# ── Prompts ────────────────────────────────────────────────────────────────────
	def build_first_prompt(article_text: str) -> str:
	    """Compose the initial prompt requesting an [ANCHOR]/[AUTHOR] dialogue.

	    The fixed instructions come first, then ``article_text`` under an
	    ``ARTICLE:`` heading, then the closing ``Begin conversation:`` cue.
	    """
	    prompt = f"""Generate a conversation between an anchor and an author based on the article below.
	The conversation should cover the main points of the article in a question-and-answer format.
	Make it as long as possible, but keep it relevant to the article content and to not exceed 40-50 turns.
	Each turn will start with the participant's role in square brackets, followed by a colon and their utterance. Make sure that all utterances have the speaker annotation.:
	[ANCHOR]: ...
	[AUTHOR]: ...
	Only one utterance per turn. Use only information from the article.
	Do not invent facts not found in the article.
	Make the conversation engaging and informative.
	Make it sound natural and human-like.
	Ignore the Acknowledgment section of the article.
	Ignore the links in the article.
	Ignore the references in the article.
	Try to discuss a little bit about the results presented in the tables of the articles.
	Make the first part of the entry utterance using: Good day everyone, welcome to our show. Today we have with us [Author's Name], the author of the article. Thank you for joining us.
	Make sure that the closing is natural, not leaving a question in the air.
	Make sure that there are only replies from the ANCHOR and AUTHOR, no other speakers or sentences added.
	VERY IMPORTANT: THE NR OF TURNS SHOULD BE EQUAL FOR BOTH ANCHOR AND AUTHOR.
	DO NOT GENERATE ANYTHING ELSE. ONLY PROVIDE THE CONVERSATION WITH THE ANNOTATIONS.
	Here is the
	ARTICLE:
	{article_text}
	Begin conversation:"""
	    return prompt

	def build_reprompt(article_text: str, user_prompt: str, previous_conv: str) -> str:
	    """Assemble the follow-up prompt from the user's own instructions.

	    The stripped ``user_prompt`` leads, followed by the article under an
	    ``ARTICLE:`` heading. A non-blank ``previous_conv`` is appended (stripped)
	    as reference, and the message ends with the ``Begin conversation:`` cue.
	    """
	    sections = [user_prompt.strip(), f"\n\nARTICLE:\n{article_text}"]
	    earlier = previous_conv.strip()
	    if earlier:
	        sections.append(f"\n\nPREVIOUS CONVERSATION (for reference):\n{earlier}")
	    sections.append("\n\nBegin conversation:")
	    return "".join(sections)

	# ── Model call ─────────────────────────────────────────────────────────────────
	# System message sent with each request; it restricts replies to the dialogue itself.
	SYSTEM = " ".join(
	    (
	        "You are a professional science-news podcast producer.",
	        "You generate ONLY structured dialogues in the exact format requested by the user.",
	        "Never add explanations, preamble, or text outside the conversation turns.",
	    )
	)

	def call_model(prompt: str, temperature: float = 0.7) -> str:
	    """Send ``prompt`` to the configured chat model and return its reply text.

	    The request carries the ``SYSTEM`` message followed by ``prompt`` as the
	    user message, with a 3000-token completion limit. The first choice's
	    content is returned with surrounding whitespace removed. Client errors
	    are not caught here.
	    """
	    chat = [
	        {"role": "system", "content": SYSTEM},
	        {"role": "user", "content": prompt},
	    ]
	    # An empty token string is passed on as None.
	    client = InferenceClient(MODEL_ID, token=HF_TOKEN or None)
	    response = client.chat_completion(
	        messages=chat,
	        max_tokens=3000,
	        temperature=temperature,
	    )
	    reply = response.choices[0].message.content
	    return reply.strip()

	# ── Handlers ───────────────────────────────────────────────────────────────────
	def generate_conversation(pdf_file):
	    """Handle the "Generate" button: build a first dialogue from the PDF.

	    Args:
	        pdf_file: Value of the PDF input, or ``None`` when nothing is uploaded.

	    Returns:
	        A ``(conversation, status, article_text)`` tuple. On any failure the
	        conversation and article text are empty strings and ``status``
	        explains the problem; no exception is raised for extraction or
	        model errors.
	    """
	    if pdf_file is None:
	        return "", "⚠️ Please upload a PDF first.", ""
	    try:
	        article_text = extract_pdf_text(pdf_file)
	    except Exception as e:
	        # Report unreadable files in the status box, the same way model
	        # errors are reported below, instead of letting them escape.
	        return "", f"❌ PDF error: {e}", ""
	    if not article_text:
	        return "", "⚠️ Could not extract text. Make sure the PDF is not scanned or password-protected.", ""
	    try:
	        conversation = call_model(build_first_prompt(article_text))
	    except Exception as e:
	        return "", f"❌ Model error: {e}", ""
	    # A turn is any line that starts with one of the two speaker tags.
	    speaker_tags = ("[ANCHOR]", "[AUTHOR]")
	    turns = sum(
	        1 for line in conversation.split("\n") if line.strip().startswith(speaker_tags)
	    )
	    return conversation, f"✅ Generated — {turns} turns · {len(conversation.split())} words", article_text


	def regenerate_conversation(user_prompt, current_conv, article_cache, pdf_file):
	    """Handle the "Regenerate" button: rebuild the dialogue from user instructions.

	    Args:
	        user_prompt: Instructions typed by the user; blank or ``None`` is rejected.
	        current_conv: Conversation currently shown; sent to the model as
	            reference and returned unchanged on failure.
	        article_cache: Article text stored by an earlier run. When blank or
	            ``None`` the text is extracted again from ``pdf_file``.
	        pdf_file: Value of the PDF input, or ``None``.

	    Returns:
	        A ``(conversation, status, article_text)`` tuple. Extraction and
	        model errors are reported through ``status`` rather than raised.
	    """
	    if not user_prompt or not user_prompt.strip():
	        return current_conv, "⚠️ Write your prompt/instructions before regenerating.", article_cache
	    # Treat a missing cache like an empty one so .strip() cannot fail.
	    article_text = article_cache or ""
	    if not article_text.strip():
	        if pdf_file is None:
	            return current_conv, "⚠️ Upload the PDF first.", ""
	        try:
	            article_text = extract_pdf_text(pdf_file)
	        except Exception as e:
	            # Report unreadable files in the status box, like model errors.
	            return current_conv, f"❌ PDF error: {e}", ""
	        if not article_text:
	            return current_conv, "⚠️ Could not extract PDF text.", ""
	    try:
	        conversation = call_model(
	            build_reprompt(article_text, user_prompt, current_conv or ""),
	            temperature=0.75,
	        )
	    except Exception as e:
	        return current_conv, f"❌ Model error: {e}", article_text
	    # A turn is any line that starts with one of the two speaker tags.
	    speaker_tags = ("[ANCHOR]", "[AUTHOR]")
	    turns = sum(
	        1 for line in conversation.split("\n") if line.strip().startswith(speaker_tags)
	    )
	    return conversation, f"✅ Regenerated — {turns} turns · {len(conversation.split())} words", article_text


	def save_to_hub(conversation: str, pdf_file):
	    """Upload the conversation as a text file to the ``SAVE_REPO`` dataset repo.

	    Args:
	        conversation: Dialogue text to store; blank or ``None`` is rejected.
	        pdf_file: Value of the PDF input (an object exposing ``.name`` or a
	            path string), used only to name the saved file. ``None`` falls
	            back to the name ``article``.

	    Returns:
	        A status string: a warning when there is nothing to save or a
	        required setting is missing, a link to the uploaded file on
	        success, or the error text when the upload fails.
	    """
	    if not conversation or not conversation.strip():
	        return "⚠️ Nothing to save — generate a conversation first."
	    if not HF_TOKEN:
	        return "⚠️ HF_TOKEN secret not set."
	    if not SAVE_REPO:
	        return "⚠️ SAVE_REPO secret not set."
	    # Read the clock once so the file name and the header always agree.
	    now = datetime.datetime.now()
	    ts = now.strftime("%Y%m%d_%H%M%S")
	    pdf_name = "article"
	    if pdf_file is not None:
	        base = os.path.basename(getattr(pdf_file, "name", pdf_file))
	        stem, ext = os.path.splitext(base)
	        # Drop only a trailing .pdf extension (any letter case); a ".pdf"
	        # occurring elsewhere in the name is kept.
	        pdf_name = (stem if ext.lower() == ".pdf" else base) or "article"
	    path_in_repo = f"conversations/{pdf_name}_{ts}.txt"
	    header = (
	        f"# Research News Podcast Conversation\n"
	        f"Source: {pdf_name}\n"
	        f"Generated: {now.strftime('%Y-%m-%d %H:%M')}\n"
	        f"Model: {MODEL_ID}\n"
	        f"{'─'*60}\n\n"
	    )
	    content = (header + conversation).encode("utf-8")
	    try:
	        api = HfApi(token=HF_TOKEN)
	        # exist_ok=True makes repeated saves reuse the same repo.
	        api.create_repo(repo_id=SAVE_REPO, repo_type="dataset", exist_ok=True, private=True)
	        api.upload_file(
	            path_or_fileobj=content,
	            path_in_repo=path_in_repo,
	            repo_id=SAVE_REPO,
	            repo_type="dataset",
	            commit_message=f"Add: {pdf_name}_{ts}",
	        )
	        return f"✅ Saved → https://huggingface.co/datasets/{SAVE_REPO}/blob/main/{path_in_repo}"
	    except Exception as e:
	        return f"❌ Save error: {e}"

	# ── UI ─────────────────────────────────────────────────────────────────────────
	# Page layout: upload and refine controls on the left, result and save on the right.
	with gr.Blocks(title="Research News Podcast Generator") as demo:

	    # Per-session copy of the extracted article text, reused when regenerating.
	    article_cache = gr.State("")

	    gr.Markdown("""
	# 🎙️ Research News Podcast Generator
	Upload your scientific article and generate an engaging [ANCHOR] / [AUTHOR] dialogue for your research news show.
	Refine with your own prompt, then save directly to HuggingFace.
	""")

	    with gr.Row():
	        with gr.Column(scale=1):
	            gr.Markdown("### 1 · Upload Article PDF")
	            pdf_input = gr.File(label="PDF file", file_types=[".pdf"])
	            gen_btn = gr.Button("⚡ Generate Conversation", variant="primary")

	            gr.Markdown("### 2 · Refine with your own prompt")
	            gr.Markdown(
	                "_Not happy with the result? Write your own prompt or just say what to change. "
	                "The model will see your prompt + the article + the previous conversation._"
	            )
	            reprompt = gr.Textbox(
	                label="Your prompt / instructions",
	                placeholder=(
	                    "Examples:\n"
	                    "• Make it shorter, max 20 turns\n"
	                    "• Focus more on the methodology\n"
	                    "• The author name is Dr. Maria Ionescu — fix the intro\n"
	                    "• Use a more informal tone"
	                ),
	                lines=6,
	            )
	            regen_btn = gr.Button("🔄 Regenerate with my prompt")

	        with gr.Column(scale=2):
	            gr.Markdown("### 3 · Generated Conversation")
	            status_msg = gr.Textbox(label="Status", interactive=False, lines=1)
	            conversation_out = gr.Textbox(
	                label="Conversation (editable)",
	                lines=25,
	                max_lines=60,
	                interactive=True,
	                show_copy_button=True,
	                placeholder="The conversation will appear here.\nYou can also edit it manually before saving.",
	            )

	            gr.Markdown("### 4 · Save to HuggingFace")
	            gr.Markdown(
	                "Requires two secrets in Space → Settings → Repository secrets: \n"
	                "`HF_TOKEN` — HuggingFace write token \n"
	                "`SAVE_REPO` — e.g. `yourname/research-conversations`"
	            )
	            save_btn = gr.Button("💾 Save conversation to HuggingFace", variant="secondary")
	            save_status = gr.Textbox(label="Save status", interactive=False, lines=1)

	    # ── Events ────────────────────────────────────────────────────────────────
	    # Forget the cached article whenever the uploaded file changes, so a
	    # regenerate after swapping PDFs re-extracts text from the new file
	    # instead of reusing the previous article.
	    pdf_input.change(
	        fn=lambda: "",
	        inputs=None,
	        outputs=[article_cache],
	    )
	    gen_btn.click(
	        fn=generate_conversation,
	        inputs=[pdf_input],
	        outputs=[conversation_out, status_msg, article_cache],
	    )
	    regen_btn.click(
	        fn=regenerate_conversation,
	        inputs=[reprompt, conversation_out, article_cache, pdf_input],
	        outputs=[conversation_out, status_msg, article_cache],
	    )
	    save_btn.click(
	        fn=save_to_hub,
	        inputs=[conversation_out, pdf_input],
	        outputs=[save_status],
	    )

	demo.launch()