Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import google.generativeai as genai | |
| import gradio as gr | |
def fetch_article_content(url):
    """Fetch an HTML article and extract (title, body text, lead image).

    Parameters:
        url: Article URL; Wikipedia pages get special treatment
             (citation markers stripped, "mw-parser-output" body div).

    Returns:
        (title, text_content, (img_url, img_alt)) on success, or
        (None, None, (None, None)) on any network / HTTP / content-type
        failure.
    """
    from urllib.parse import urljoin, unquote

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=10)
    except Exception:
        # Any transport failure (DNS, timeout, SSL, ...) maps to "no content".
        return None, None, (None, None)
    if resp.status_code != 200 or 'text/html' not in resp.headers.get('Content-Type', ''):
        return None, None, (None, None)

    soup = BeautifulSoup(resp.text, 'html.parser')
    # Drop page chrome so text extraction sees only article content.
    for tag in soup(['script', 'style', 'header', 'footer', 'noscript', 'form', 'nav', 'aside']):
        tag.decompose()

    title_tag = soup.find('h1') or soup.title
    title = title_tag.get_text().strip() if title_tag else "Untitled"
    # Fix: strip only the trailing " - Wikipedia" suffix; str.replace()
    # would also delete an identical substring appearing mid-title.
    if title.endswith(" - Wikipedia"):
        title = title[:-len(" - Wikipedia")]

    content_div = soup.find('div', {'class': 'mw-parser-output'}) or soup.body
    if content_div:
        # Remove Wikipedia citation superscripts ([1], [2], ...) and the
        # reference list itself before collecting paragraph text.
        for ref in content_div.find_all('sup', {'class': 'reference'}):
            ref.decompose()
        for ref_list in content_div.find_all(['ol', 'ul'], {'class': 'references'}):
            ref_list.decompose()
        paragraphs = content_div.find_all('p')
        text_content = "\n\n".join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
    else:
        text_content = soup.get_text(separator="\n")
    text_content = text_content.strip()

    # --- Lead image selection -------------------------------------------
    imgs = content_div.find_all('img') if content_div else soup.find_all('img')

    def _first_image(accept):
        # Return (src, alt) of the first <img> whose src/alt `accept` likes.
        for img in imgs:
            src = img.get('src', '')
            alt = img.get('alt', '')
            if src and accept(src, alt):
                return src, (alt if alt else "")
        return None, ""

    # Fix: prefer a Wikipedia/Commons-hosted image anywhere on the page,
    # falling back to the first non-logo/icon image only if none exists.
    # The original single loop broke out on the first non-logo <img>, so
    # the wiki preference effectively applied only to the very first image.
    img_url, img_alt = _first_image(
        lambda src, alt: "upload" in src or "commons" in src or "wikipedia" in src
    )
    if not img_url:
        img_url, img_alt = _first_image(
            lambda src, alt: alt.lower() not in ["logo", "icon"]
        )

    if img_url:
        if img_url.startswith("//"):
            # Protocol-relative URL (common on Wikipedia).
            img_url = "https:" + img_url
        elif img_url.startswith("/"):
            img_url = urljoin(url, img_url)
        if not img_alt:
            # Derive alt text from the filename: strip any "NNNpx-" size
            # prefix and the file extension, then de-underscore.
            fname = unquote(img_url.split('/')[-1])
            fname = re.sub(r'^\d+px-', '', fname)
            fname = re.sub(r'\.[A-Za-z0-9]+$', '', fname)
            img_alt = fname.replace('_', ' ').strip()
        if not img_alt:
            img_alt = "Image"

    return title, text_content, (img_url, img_alt)
def generate_post(platform, title, content, model):
    """Ask the model to rewrite an article as a platform-styled HTML post.

    Parameters:
        platform: Target site name ("reddit", "quora", or anything else
                  for a neutral tone); matched case-insensitively.
        title:    Article title, included in the prompt for context.
        content:  Plain-text article body to transform.
        model:    Object exposing generate_content(prompt) -> response
                  with a .text attribute (e.g. a Gemini GenerativeModel).

    Returns:
        The generated HTML body, stripped of surrounding whitespace, or
        an inline HTML error message if the API call raises.
    """
    key = platform.lower()

    # Per-platform tone/formatting directives; unknown platforms get a
    # neutral default via dict lookup.
    tone_by_platform = {
        "reddit": (
            "an informal, conversational tone, as if posting on Reddit. "
            "Format the response using HTML tags for paragraphs and lists, "
            "but do not wrap it in triple backticks or ```html code blocks."
        ),
        "quora": (
            "a clear, detailed explanatory tone, as if answering on Quora. "
            "Use proper HTML for readability, without wrapping in code blocks."
        ),
    }
    tone = tone_by_platform.get(key, "a clear and accessible tone")

    prompt = (
        f"Transform the following article content into {tone}.\n"
        f"Output the result in valid HTML format with proper paragraphs (and lists if needed).\n"
        f"Do NOT include the title or image — only the body content in HTML.\n\n"
        f"Article Title: {title}\n"
        f"Article Content:\n\"\"\"\n{content}\n\"\"\""
    )

    try:
        reply = model.generate_content(prompt)
    except Exception as exc:
        # Surface API failures inline so the UI still renders something.
        return f"<p><em>Error: failed to generate {key} content ({exc})</em></p>"
    return reply.text.strip()
def process_url(url, api_key):
    """Run the full pipeline: fetch the article at `url`, then generate
    Reddit- and Quora-styled HTML posts with the Gemini API.

    Returns:
        (reddit_html, quora_html). On any failure (missing key, API
        configuration error, unreadable URL) both slots carry the same
        inline HTML error message.
    """
    # Guard clauses: each failure path returns the same message twice,
    # one per output pane.
    if not api_key:
        error_msg = "<p><em>API key is required.</em></p>"
        return error_msg, error_msg

    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
    except Exception as e:
        error_msg = f"<p><em>Failed to configure Gemini API: {e}</em></p>"
        return error_msg, error_msg

    title, content, (img_url, img_alt) = fetch_article_content(url)
    if not content:
        error_msg = f"<p><em>Could not retrieve content from this URL: <a href='{url}'>{url}</a></em></p>"
        return error_msg, error_msg

    # Generate both bodies, then assemble each page from shared fragments:
    # heading + optional lead image + generated body + source attribution.
    bodies = {name: generate_post(name, title, content, model) for name in ("reddit", "quora")}
    heading = f"<h2>{title}</h2>\n"
    image_html = ""
    if img_url:
        image_html = f'<img src="{img_url}" alt="{img_alt}" style="max-width:100%; height:auto;" />\n'
    source_html = f'<p><small><em>Source: <a href="{url}" target="_blank">{url}</a></em></small></p>'

    reddit_html = heading + image_html + bodies["reddit"] + source_html
    quora_html = heading + image_html + bodies["quora"] + source_html
    return reddit_html, quora_html
# Gradio interface wiring: two text inputs (article URL + Gemini API key)
# mapped through process_url to two HTML output panes.
url_input = gr.Textbox(label="Article URL", placeholder="https://en.wikipedia.org/wiki/Kefir")
key_input = gr.Textbox(label="Gemini API Key", placeholder="Paste your Gemini API key here", type="password")
reddit_output = gr.HTML(label="Reddit-formatted Post")
quora_output = gr.HTML(label="Quora-formatted Post")

demo = gr.Interface(
    fn=process_url,
    inputs=[url_input, key_input],
    outputs=[reddit_output, quora_output],
    title="Article → Reddit & Quora Post Generator",
    description="Enter an article link and your Gemini API key to generate Reddit- and Quora-style posts in HTML.",
)

if __name__ == "__main__":
    demo.launch()