Spaces:

catalinmoisan
/

ground_truth_generator

Runtime error

App Files Files Community

ground_truth_generator / app.py

catalinmoisan

Upload app.py

ce8871a verified about 2 months ago

raw

history blame contribute delete

17.9 kB

	import gradio as gr
	import fitz # PyMuPDF
	from huggingface_hub import InferenceClient, HfApi
	import datetime
	import os

	# ── Config ─────────────────────────────────────────────────────────────────────
	# Chat model requested through the Hugging Face Inference API.
	MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
	# Access token (Space → Settings → Secrets); empty string when unset.
	HF_TOKEN = os.getenv("HF_TOKEN", "")
	# Target dataset repo, e.g. "username/research-conversations"; empty when unset.
	SAVE_REPO = os.getenv("SAVE_REPO", "")

	# ── PDF extraction ─────────────────────────────────────────────────────────────
	def extract_pdf_text(pdf_file, max_words: int = 6000) -> str:
	    """Return the plain text of a PDF, capped at ``max_words`` words.

	    Args:
	        pdf_file: Uploaded file object exposing its on-disk path as ``.name``,
	            a plain path string, or ``None``.
	        max_words: Maximum number of whitespace-separated words to keep.

	    Returns:
	        The text of all pages joined by blank lines and stripped. When the
	        text exceeds ``max_words`` words, the first ``max_words`` words are
	        re-joined with single spaces and a truncation marker is appended.
	        An empty string when ``pdf_file`` is ``None``.
	    """
	    if pdf_file is None:
	        return ""

	    # Accept either an object carrying the path in ``.name`` or the path itself.
	    path = getattr(pdf_file, "name", pdf_file)

	    # The context manager closes the document even if a page fails to render.
	    with fitz.open(path) as doc:
	        pages = [page.get_text() for page in doc]

	    text = "\n\n".join(pages).strip()
	    words = text.split()
	    if len(words) > max_words:
	        text = " ".join(words[:max_words]) + "\n\n[... article truncated ...]"
	    return text

	# ── Fixed first-generation prompt ─────────────────────────────────────────────
	def build_first_prompt(article_text: str) -> str:
	    """Build the fixed instruction prompt used for the first generation.

	    The prompt asks for an [ANCHOR]/[AUTHOR] dialogue about the article and
	    embeds ``article_text`` verbatim after the ``ARTICLE:`` marker, ending
	    with ``Begin conversation:``.

	    Args:
	        article_text: Extracted article text to interpolate into the prompt.

	    Returns:
	        The complete prompt string.
	    """
	    return f"""Generate a conversation between an anchor and an author based on the article below.
	The conversation should cover the main points of the article in a question-and-answer format.
	Make it as long as possible, but keep it relevant to the article content and to not exceed 40-50 turns.
	Each turn will start with the participant's role in square brackets, followed by a colon and their utterance. Make sure that all utterances have the speaker annotation.:
	[ANCHOR]: ...
	[AUTHOR]: ...
	Only one utterance per turn. Use only information from the article.
	Do not invent facts not found in the article.
	Make the conversation engaging and informative.
	Make it sound natural and human-like.
	Ignore the Acknowledgment section of the article.
	Ignore the links in the article.
	Ignore the references in the article.
	Try to discuss a little bit about the results presented in the tables of the articles.
	Make the first part of the entry utterance using: Good day everyone, welcome to our show. Today we have with us [Author's Name], the author of the article. Thank you for joining us.
	Make sure that the closing is natural, not leaving a question in the air.
	Make sure that there are only replies from the ANCHOR and AUTHOR, no other speakers or sentences added.
	VERY IMPORTANT: THE NR OF TURNS SHOULD BE EQUAL FOR BOTH ANCHOR AND AUTHOR.
	DO NOT GENERATE ANYTHING ELSE. ONLY PROVIDE THE CONVERSATION WITH THE ANNOTATIONS.
	Here is the
	ARTICLE:
	{article_text}
	Begin conversation:"""

	# ── Re-generation: user writes their own full prompt ──────────────────────────
	def build_reprompt(article_text: str, user_prompt: str, previous_conv: str) -> str:
	    """Assemble the regeneration prompt from the user's own instructions.

	    The sections are, in order: the stripped user prompt, the article, the
	    stripped previous conversation (only when it is non-blank), and the
	    closing ``Begin conversation:`` cue. Sections are separated by one
	    blank line.
	    """
	    sections = [user_prompt.strip(), f"ARTICLE:\n{article_text}"]

	    earlier = previous_conv.strip()
	    if earlier:
	        sections.append(f"PREVIOUS CONVERSATION (for reference):\n{earlier}")

	    sections.append("Begin conversation:")
	    return "\n\n".join(sections)

	# ── Call model ─────────────────────────────────────────────────────────────────
	# System message sent with every chat request made by call_model.
	SYSTEM = " ".join(
	    [
	        "You are a professional science-news podcast producer.",
	        "You generate ONLY structured dialogues in the exact format requested by the user.",
	        "Never add explanations, preamble, or text outside the conversation turns.",
	    ]
	)

	def call_model(prompt: str, temperature: float = 0.7, max_tokens: int = 3000) -> str:
	    """Send ``prompt`` to the chat model and return the stripped reply text.

	    Args:
	        prompt: User message; it is sent together with the SYSTEM message.
	        temperature: Sampling temperature forwarded to the API.
	        max_tokens: Upper bound on generated tokens forwarded to the API.

	    Returns:
	        The content of the first returned choice, stripped of surrounding
	        whitespace.

	    Raises:
	        RuntimeError: If the response has no choices or the first choice has
	            no text content.
	    """
	    # An empty token is passed as None rather than as an empty string.
	    client = InferenceClient(MODEL_ID, token=HF_TOKEN or None)
	    result = client.chat_completion(
	        messages=[
	            {"role": "system", "content": SYSTEM},
	            {"role": "user", "content": prompt},
	        ],
	        max_tokens=max_tokens,
	        temperature=temperature,
	    )

	    # Fail with a readable message instead of an IndexError/AttributeError
	    # when the API answers without usable text.
	    choices = result.choices
	    if not choices or choices[0].message.content is None:
	        raise RuntimeError("Model returned an empty response.")
	    return choices[0].message.content.strip()

	# ── Generate (first time) ──────────────────────────────────────────────────────
	def generate_conversation(pdf_file, progress=gr.Progress()):
	    """Extract the uploaded PDF and generate the first conversation.

	    Args:
	        pdf_file: Uploaded PDF handed over by the file input, or ``None``.
	        progress: Gradio progress tracker used to report the current step.

	    Returns:
	        A ``(conversation, status message, article text)`` tuple. On any
	        failure the conversation and article text are empty strings and the
	        status message describes the problem.
	    """
	    if pdf_file is None:
	        return "", "⚠️ Please upload a PDF first.", ""

	    progress(0.1, desc="Extracting text from PDF…")
	    try:
	        article_text = extract_pdf_text(pdf_file)
	    except Exception:
	        # Unreadable or encrypted files fail inside the extractor; report them
	        # through the same warning as an empty extraction instead of crashing.
	        article_text = ""
	    if not article_text:
	        return "", "⚠️ Could not extract text. Make sure the PDF is not scanned or password-protected.", ""

	    progress(0.35, desc="Sending to model…")
	    try:
	        conversation = call_model(build_first_prompt(article_text))
	    except Exception as e:
	        return "", f"❌ Model error: {e}", ""

	    # A turn is any line that begins with one of the two speaker tags.
	    speaker_tags = ("[ANCHOR]", "[AUTHOR]")
	    turns = sum(
	        line.strip().startswith(speaker_tags) for line in conversation.split("\n")
	    )
	    info = f"✅ Generated — {turns} turns · {len(conversation.split())} words"
	    progress(1.0)
	    return conversation, info, article_text

	# ── Re-generate with user's custom prompt ─────────────────────────────────────
	def regenerate_conversation(user_prompt, current_conv, article_cache, pdf_file, progress=gr.Progress()):
	    """Regenerate the conversation from the user's own prompt.

	    Args:
	        user_prompt: Instructions typed by the user; must not be blank.
	        current_conv: Conversation currently shown; it is sent to the model
	            as reference and returned unchanged on failure.
	        article_cache: Article text kept from an earlier run, or "".
	        pdf_file: Uploaded PDF, only read when ``article_cache`` is blank.
	        progress: Gradio progress tracker used to report the current step.

	    Returns:
	        A ``(conversation, status message, article text)`` tuple.
	    """
	    if not user_prompt.strip():
	        return current_conv, "⚠️ Write your prompt/instructions before regenerating.", article_cache

	    # NOTE(review): a non-blank cache is reused as-is, so a PDF uploaded after
	    # the last generation is not re-extracted here — confirm this is intended.
	    article_text = article_cache
	    if not article_text.strip():
	        if pdf_file is None:
	            return current_conv, "⚠️ Upload the PDF first.", ""
	        progress(0.1, desc="Extracting PDF…")
	        try:
	            article_text = extract_pdf_text(pdf_file)
	        except Exception:
	            # Unreadable or encrypted files fail inside the extractor; report
	            # them through the same warning as an empty extraction.
	            article_text = ""
	        if not article_text:
	            return current_conv, "⚠️ Could not extract PDF text.", ""

	    progress(0.3, desc="Sending new prompt to model…")
	    try:
	        conversation = call_model(
	            build_reprompt(article_text, user_prompt, current_conv),
	            temperature=0.75,
	        )
	    except Exception as e:
	        return current_conv, f"❌ Model error: {e}", article_text

	    # A turn is any line that begins with one of the two speaker tags.
	    speaker_tags = ("[ANCHOR]", "[AUTHOR]")
	    turns = sum(
	        line.strip().startswith(speaker_tags) for line in conversation.split("\n")
	    )
	    info = f"✅ Regenerated — {turns} turns · {len(conversation.split())} words"
	    progress(1.0)
	    return conversation, info, article_text

	# ── Save to HuggingFace Hub ────────────────────────────────────────────────────
	def save_to_hub(conversation: str, pdf_file):
	    """Upload the conversation as a ``.txt`` file to the configured dataset repo.

	    Args:
	        conversation: Conversation text to store; must not be blank.
	        pdf_file: Uploaded file object exposing its path as ``.name``, a plain
	            path string, or ``None``. Only used to name the saved file.

	    Returns:
	        A status message: a warning when there is nothing to save or a
	        required setting is missing, the file URL on success, or the error
	        text when the upload fails.
	    """
	    if not conversation.strip():
	        return "⚠️ Nothing to save — generate a conversation first."
	    if not HF_TOKEN:
	        return "⚠️ HF_TOKEN secret not set. Add it in Space → Settings → Repository secrets."
	    if not SAVE_REPO:
	        return "⚠️ SAVE_REPO secret not set. Add it as 'username/your-dataset-repo'."

	    # Take the clock once so the file name and the header cannot disagree.
	    now = datetime.datetime.now()
	    ts = now.strftime("%Y%m%d_%H%M%S")

	    pdf_name = "article"
	    if pdf_file is not None:
	        source_path = getattr(pdf_file, "name", pdf_file)
	        stem, ext = os.path.splitext(os.path.basename(source_path))
	        # Drop only a trailing ".pdf" (any letter case); keep other suffixes.
	        pdf_name = (stem if ext.lower() == ".pdf" else stem + ext) or "article"

	    path_in_repo = f"conversations/{pdf_name}_{ts}.txt"
	    header = (
	        f"# Research News Podcast Conversation\n"
	        f"Source article: {pdf_name}\n"
	        f"Generated: {now.strftime('%Y-%m-%d %H:%M')}\n"
	        f"Model: {MODEL_ID}\n"
	        f"{'─'*60}\n\n"
	    )
	    content = (header + conversation).encode("utf-8")

	    try:
	        api = HfApi(token=HF_TOKEN)
	        # exist_ok=True makes this a no-op when the repo is already there.
	        api.create_repo(repo_id=SAVE_REPO, repo_type="dataset", exist_ok=True, private=True)
	        api.upload_file(
	            path_or_fileobj=content,
	            path_in_repo=path_in_repo,
	            repo_id=SAVE_REPO,
	            repo_type="dataset",
	            commit_message=f"Add conversation: {pdf_name}_{ts}",
	        )
	        url = f"https://huggingface.co/datasets/{SAVE_REPO}/blob/main/{path_in_repo}"
	        return f"✅ Saved → {url}"
	    except Exception as e:
	        # Returned as text so the UI can show the failure to the user.
	        return f"❌ Save error: {e}"

	# ── CSS ────────────────────────────────────────────────────────────────────────
	# Stylesheet passed to gr.Blocks(css=...). The classes defined here are the
	# ones attached to components through elem_classes and used in the inline
	# HTML snippets of the UI. The font import also loads DM Mono, which the
	# .conversation-box rule asks for.
	CUSTOM_CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@400;700;900&family=DM+Sans:wght@300;400;500&family=DM+Mono:wght@400;500&display=swap');

	:root {
	--ink: #0e0c0a;
	--paper: #f5f0e8;
	--accent: #c8392b;
	--accent2: #1d4ed8;
	--muted: #7a7060;
	--border: #d4cfc4;
	--card: #fffdf8;
	}
	body, .gradio-container {
	background: var(--paper) !important;
	font-family: 'DM Sans', sans-serif !important;
	}
	.gradio-container { max-width: 1040px !important; margin: 0 auto !important; }

	.app-header { text-align:center; padding:36px 20px 22px; border-bottom:2.5px solid var(--ink); margin-bottom:28px; }
	.app-header h1 { font-family:'Playfair Display',Georgia,serif; font-size:2.5rem; font-weight:900; color:var(--ink); letter-spacing:-.02em; margin:0 0 4px; }
	.app-header .kicker { font-size:.75rem; letter-spacing:.18em; text-transform:uppercase; color:var(--accent); font-weight:500; margin-bottom:8px; }
	.app-header .subtitle { color:var(--muted); font-size:.95rem; max-width:560px; margin:0 auto; line-height:1.65; }

	.panel-label { font-family:'Playfair Display',serif; font-size:1rem; font-weight:700; color:var(--ink); border-bottom:1px solid var(--border); padding-bottom:7px; margin-bottom:10px; display:flex; align-items:center; gap:8px; }
	.step-badge { background:var(--ink); color:var(--paper); border-radius:50%; width:21px; height:21px; display:inline-flex; align-items:center; justify-content:center; font-size:.68rem; font-family:'DM Sans',sans-serif; font-weight:600; flex-shrink:0; }

	.upload-zone { border:2px dashed var(--border) !important; border-radius:6px !important; background:var(--card) !important; transition:border-color .2s; }
	.upload-zone:hover { border-color:var(--accent) !important; }

	button.btn-primary { background:var(--ink) !important; color:var(--paper) !important; border:none !important; border-radius:4px !important; font-family:'DM Sans',sans-serif !important; font-weight:500 !important; letter-spacing:.04em !important; transition:background .2s !important; }
	button.btn-primary:hover { background:var(--accent) !important; }

	button.btn-secondary { background:transparent !important; color:var(--ink) !important; border:1.5px solid var(--ink) !important; border-radius:4px !important; font-family:'DM Sans',sans-serif !important; font-weight:500 !important; transition:all .2s !important; }
	button.btn-secondary:hover { background:var(--ink) !important; color:var(--paper) !important; }

	button.btn-save { background:var(--accent2) !important; color:white !important; border:none !important; border-radius:4px !important; font-family:'DM Sans',sans-serif !important; font-weight:500 !important; transition:background .2s !important; }
	button.btn-save:hover { background:#1e3a8a !important; }

	.conversation-box textarea { font-family:'DM Mono','Courier New',monospace !important; font-size:.9rem !important; line-height:1.8 !important; background:var(--card) !important; border:1px solid var(--border) !important; border-radius:6px !important; color:var(--ink) !important; padding:14px !important; }

	.reprompt-box textarea { font-size:.92rem !important; line-height:1.6 !important; border:1.5px solid var(--accent2) !important; border-radius:6px !important; background:#f0f4ff !important; }

	.status-bar { font-size:.83rem; color:var(--muted); padding:4px 0; font-style:italic; }
	.section-div { border:none; border-top:1px solid var(--border); margin:20px 0; }
	.info-box { background:var(--card); border:1px solid var(--border); border-radius:6px; padding:12px 16px; font-size:.85rem; color:var(--muted); line-height:1.6; }
	"""

	# ── UI ─────────────────────────────────────────────────────────────────────────
	# Build the whole interface: a left column for upload and refinement, a right
	# column for the generated conversation and saving, then the button events.
	with gr.Blocks(css=CUSTOM_CSS, title="Research News Podcast Generator") as demo:

	    # Extracted article text kept between clicks so regeneration can reuse it.
	    article_cache = gr.State("")

	    gr.HTML("""
	<div class="app-header">
	<div class="kicker">🎙 Tool for Research Authors</div>
	<h1>Research News<br>Podcast Generator</h1>
	<p class="subtitle">
	Upload your scientific article and generate an engaging anchor–author dialogue
	for your research news show. Refine with your own prompt, then save to HuggingFace.
	</p>
	</div>
	""")

	    with gr.Row(equal_height=False):

	        # ── Left: upload + reprompt ───────────────────────────────────────────
	        with gr.Column(scale=1, min_width=300):

	            gr.HTML('<div class="panel-label"><span class="step-badge">1</span>Upload Article PDF</div>')
	            pdf_input = gr.File(label="", file_types=[".pdf"], elem_classes=["upload-zone"])
	            gen_btn = gr.Button("⚡ Generate Conversation", elem_classes=["btn-primary"], variant="primary")

	            gr.HTML('<hr class="section-div"><div class="panel-label"><span class="step-badge">2</span>Refine — write your own prompt</div>')
	            gr.HTML("""
	<div class="info-box" style="margin-bottom:10px;">
	Not happy with the result? Write your own prompt below — you can give
	full instructions or just say what to change. The model will see both
	your prompt and the previous conversation.<br><br>
	<em>Examples:</em><br>
	• Make it shorter, max 20 turns<br>
	• Focus more on the methodology section<br>
	• Use a more informal tone<br>
	• The author name is Dr. Maria Ionescu — fix the intro
	</div>
	""")
	            reprompt = gr.Textbox(
	                label="",
	                placeholder="Write your new prompt or instructions here…",
	                lines=7,
	                elem_classes=["reprompt-box"],
	            )
	            regen_btn = gr.Button("🔄 Regenerate with my prompt", elem_classes=["btn-secondary"])

	            gr.HTML("""
	<div class="info-box" style="margin-top:16px;">
	<strong>Format:</strong> [ANCHOR]: … / [AUTHOR]: …<br>
	<strong>Model:</strong> Mistral-7B-Instruct (HF Inference API)<br>
	<strong>PDF limit:</strong> first ~6 000 words processed
	</div>
	""")

	        # ── Right: output + save ──────────────────────────────────────────────
	        with gr.Column(scale=2):

	            gr.HTML('<div class="panel-label"><span class="step-badge">3</span>Generated Conversation</div>')
	            status_msg = gr.Textbox(
	                label="", interactive=False, lines=1, max_lines=1,
	                placeholder="Status will appear here…", elem_classes=["status-bar"],
	            )
	            conversation_out = gr.Textbox(
	                label="", lines=26, max_lines=60, interactive=True,
	                show_copy_button=True,
	                placeholder=(
	                    "The conversation will appear here after generation.\n"
	                    "You can also edit it manually before saving."
	                ),
	                elem_classes=["conversation-box"],
	            )

	            gr.HTML('<hr class="section-div"><div class="panel-label"><span class="step-badge">4</span>Save to HuggingFace</div>')
	            gr.HTML("""
	<div class="info-box" style="margin-bottom:10px;">
	Saved as a <code>.txt</code> file inside a private HuggingFace dataset repo.<br>
	Requires two secrets in <em>Space → Settings → Repository secrets</em>:<br>
	• <code>HF_TOKEN</code> — your HuggingFace write token<br>
	• <code>SAVE_REPO</code> — e.g. <code>yourname/research-conversations</code>
	</div>
	""")
	            save_btn = gr.Button("💾 Save conversation to HuggingFace", elem_classes=["btn-save"])
	            save_status = gr.HTML("")

	    # ── Events ────────────────────────────────────────────────────────────────
	    gen_btn.click(
	        fn=generate_conversation,
	        inputs=[pdf_input],
	        outputs=[conversation_out, status_msg, article_cache],
	        show_progress=True,
	    )

	    regen_btn.click(
	        fn=regenerate_conversation,
	        inputs=[reprompt, conversation_out, article_cache, pdf_input],
	        outputs=[conversation_out, status_msg, article_cache],
	        show_progress=True,
	    )

	    def render_save(conv, pdf):
	        """Save the conversation and return the status as a coloured HTML box.

	        The box uses the blue palette when the status starts with "✅" and
	        the red palette otherwise.
	        """
	        import html

	        msg = save_to_hub(conv, pdf)
	        ok = msg.startswith("✅")
	        bg = "#eef4ff" if ok else "#fff0f0"
	        br = "#bfd2f8" if ok else "#f8bfbf"
	        tc = "#1e3a8a" if ok else "#991b1b"
	        # The status can contain the uploaded file name and exception text,
	        # so escape it before placing it inside markup.
	        safe_msg = html.escape(msg)
	        return f'<div style="font-size:.88rem;padding:10px 14px;border-radius:5px;background:{bg};border:1px solid {br};color:{tc};word-break:break-all;margin-top:4px;">{safe_msg}</div>'

	    save_btn.click(
	        fn=render_save,
	        inputs=[conversation_out, pdf_input],
	        outputs=[save_status],
	    )

	# Launch the Gradio app only when this file is run as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	    demo.launch()