Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import google.generativeai as genai | |
| import gradio as gr | |
def fetch_article_content(url):
    """Fetch an HTML article and extract (title, body text, lead image).

    Parameters:
        url: Article URL; Wikipedia pages get special treatment
             (citation markers stripped, "mw-parser-output" body div).

    Returns:
        (title, text_content, (img_url, img_alt)) on success, or
        (None, None, (None, None)) on any network / HTTP / content-type
        failure.
    """
    from urllib.parse import urljoin, unquote

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=10)
    except Exception:
        # Any transport failure (DNS, timeout, SSL, ...) maps to "no content".
        return None, None, (None, None)
    if resp.status_code != 200 or 'text/html' not in resp.headers.get('Content-Type', ''):
        return None, None, (None, None)

    soup = BeautifulSoup(resp.text, 'html.parser')
    # Drop page chrome so text extraction sees only article content.
    for tag in soup(['script', 'style', 'header', 'footer', 'noscript', 'form', 'nav', 'aside']):
        tag.decompose()

    title_tag = soup.find('h1') or soup.title
    title = title_tag.get_text().strip() if title_tag else "Untitled"
    # Fix: strip only the trailing " - Wikipedia" suffix; str.replace()
    # would also delete an identical substring appearing mid-title.
    if title.endswith(" - Wikipedia"):
        title = title[:-len(" - Wikipedia")]

    content_div = soup.find('div', {'class': 'mw-parser-output'}) or soup.body
    if content_div:
        # Remove Wikipedia citation superscripts ([1], [2], ...) and the
        # reference list itself before collecting paragraph text.
        for ref in content_div.find_all('sup', {'class': 'reference'}):
            ref.decompose()
        for ref_list in content_div.find_all(['ol', 'ul'], {'class': 'references'}):
            ref_list.decompose()
        paragraphs = content_div.find_all('p')
        text_content = "\n\n".join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
    else:
        text_content = soup.get_text(separator="\n")
    text_content = text_content.strip()

    # --- Lead image selection -------------------------------------------
    imgs = content_div.find_all('img') if content_div else soup.find_all('img')

    def _first_image(accept):
        # Return (src, alt) of the first <img> whose src/alt `accept` likes.
        for img in imgs:
            src = img.get('src', '')
            alt = img.get('alt', '')
            if src and accept(src, alt):
                return src, (alt if alt else "")
        return None, ""

    # Fix: prefer a Wikipedia/Commons-hosted image anywhere on the page,
    # falling back to the first non-logo/icon image only if none exists.
    # The original single loop broke out on the first non-logo <img>, so
    # the wiki preference effectively applied only to the very first image.
    img_url, img_alt = _first_image(
        lambda src, alt: "upload" in src or "commons" in src or "wikipedia" in src
    )
    if not img_url:
        img_url, img_alt = _first_image(
            lambda src, alt: alt.lower() not in ["logo", "icon"]
        )

    if img_url:
        if img_url.startswith("//"):
            # Protocol-relative URL (common on Wikipedia).
            img_url = "https:" + img_url
        elif img_url.startswith("/"):
            img_url = urljoin(url, img_url)
        if not img_alt:
            # Derive alt text from the filename: strip any "NNNpx-" size
            # prefix and the file extension, then de-underscore.
            fname = unquote(img_url.split('/')[-1])
            fname = re.sub(r'^\d+px-', '', fname)
            fname = re.sub(r'\.[A-Za-z0-9]+$', '', fname)
            img_alt = fname.replace('_', ' ').strip()
        if not img_alt:
            img_alt = "Image"

    return title, text_content, (img_url, img_alt)
def generate_post(platform, title, content, model):
    """Ask the model to rewrite an article as a platform-styled HTML post.

    Parameters:
        platform: Target site name ("reddit", "quora", or anything else
                  for a neutral tone); matched case-insensitively.
        title:    Article title, included in the prompt for context.
        content:  Plain-text article body to transform.
        model:    Object exposing generate_content(prompt) -> response
                  with a .text attribute (e.g. a Gemini GenerativeModel).

    Returns:
        The generated HTML body, stripped of surrounding whitespace, or
        an inline HTML error message if the API call raises.
    """
    key = platform.lower()

    # Per-platform tone/formatting directives; unknown platforms get a
    # neutral default via dict lookup.
    tone_by_platform = {
        "reddit": (
            "an informal, conversational tone, as if posting on Reddit. "
            "Format the response using HTML tags for paragraphs and lists, "
            "but do not wrap it in triple backticks or ```html code blocks."
        ),
        "quora": (
            "a clear, detailed explanatory tone, as if answering on Quora. "
            "Use proper HTML for readability, without wrapping in code blocks."
        ),
    }
    tone = tone_by_platform.get(key, "a clear and accessible tone")

    prompt = (
        f"Transform the following article content into {tone}.\n"
        f"Output the result in valid HTML format with proper paragraphs (and lists if needed).\n"
        f"Do NOT include the title or image — only the body content in HTML.\n\n"
        f"Article Title: {title}\n"
        f"Article Content:\n\"\"\"\n{content}\n\"\"\""
    )

    try:
        reply = model.generate_content(prompt)
    except Exception as exc:
        # Surface API failures inline so the UI still renders something.
        return f"<p><em>Error: failed to generate {key} content ({exc})</em></p>"
    return reply.text.strip()
def process_url(url, api_key):
    """Run the full pipeline: fetch the article at `url`, then generate
    Reddit- and Quora-styled HTML posts with the Gemini API.

    Returns:
        (reddit_html, quora_html). On any failure (missing key, API
        configuration error, unreadable URL) both slots carry the same
        inline HTML error message.
    """
    # Guard clauses: each failure path returns the same message twice,
    # one per output pane.
    if not api_key:
        error_msg = "<p><em>API key is required.</em></p>"
        return error_msg, error_msg

    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
    except Exception as e:
        error_msg = f"<p><em>Failed to configure Gemini API: {e}</em></p>"
        return error_msg, error_msg

    title, content, (img_url, img_alt) = fetch_article_content(url)
    if not content:
        error_msg = f"<p><em>Could not retrieve content from this URL: <a href='{url}'>{url}</a></em></p>"
        return error_msg, error_msg

    # Generate both bodies, then assemble each page from shared fragments:
    # heading + optional lead image + generated body + source attribution.
    bodies = {name: generate_post(name, title, content, model) for name in ("reddit", "quora")}
    heading = f"<h2>{title}</h2>\n"
    image_html = ""
    if img_url:
        image_html = f'<img src="{img_url}" alt="{img_alt}" style="max-width:100%; height:auto;" />\n'
    source_html = f'<p><small><em>Source: <a href="{url}" target="_blank">{url}</a></em></small></p>'

    reddit_html = heading + image_html + bodies["reddit"] + source_html
    quora_html = heading + image_html + bodies["quora"] + source_html
    return reddit_html, quora_html
# Gradio interface wiring: two text inputs (article URL + Gemini API key)
# mapped through process_url to two HTML output panes.
url_input = gr.Textbox(label="Article URL", placeholder="https://en.wikipedia.org/wiki/Kefir")
key_input = gr.Textbox(label="Gemini API Key", placeholder="Paste your Gemini API key here", type="password")
reddit_output = gr.HTML(label="Reddit-formatted Post")
quora_output = gr.HTML(label="Quora-formatted Post")

demo = gr.Interface(
    fn=process_url,
    inputs=[url_input, key_input],
    outputs=[reddit_output, quora_output],
    title="Article → Reddit & Quora Post Generator",
    description="Enter an article link and your Gemini API key to generate Reddit- and Quora-style posts in HTML.",
)

if __name__ == "__main__":
    demo.launch()