Spaces:
Runtime error
Runtime error
| import os | |
| import openai | |
| import whisper | |
| import tempfile | |
| import gradio as gr | |
| from pydub import AudioSegment | |
| import fitz # PyMuPDF for handling PDFs | |
| import docx # For handling .docx files | |
| import pandas as pd # For handling .xlsx and .csv files | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from moviepy.editor import VideoFileClip | |
| import yt_dlp | |
| import logging | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| # Configure your OpenAI API key | |
| openai.api_key = os.getenv("OPENAI_API_KEY") | |
| # Load the highest quality Whisper model once | |
| model = whisper.load_model("large") | |
| def download_social_media_video(url): | |
| """Downloads a video from social media.""" | |
| ydl_opts = { | |
| 'format': 'bestaudio/best', | |
| 'postprocessors': [{ | |
| 'key': 'FFmpegExtractAudio', | |
| 'preferredcodec': 'mp3', | |
| 'preferredquality': '192', | |
| }], | |
| 'outtmpl': '%(id)s.%(ext)s', | |
| } | |
| try: | |
| with yt_dlp.YoutubeDL(ydl_opts) as ydl: | |
| info_dict = ydl.extract_info(url, download=True) | |
| audio_file = f"{info_dict['id']}.mp3" | |
| logger.info(f"Video successfully downloaded: {audio_file}") | |
| return audio_file | |
| except Exception as e: | |
| logger.error(f"Error downloading video: {str(e)}") | |
| raise | |
| def convert_video_to_audio(video_file): | |
| """Converts a video file to audio.""" | |
| try: | |
| video = VideoFileClip(video_file) | |
| with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file: | |
| video.audio.write_audiofile(temp_file.name) | |
| logger.info(f"Video converted to audio: {temp_file.name}") | |
| return temp_file.name | |
| except Exception as e: | |
| logger.error(f"Error converting video to audio: {str(e)}") | |
| raise | |
| def preprocess_audio(audio_file): | |
| """Preprocesses the audio file to improve quality.""" | |
| try: | |
| audio = AudioSegment.from_file(audio_file) | |
| audio = audio.apply_gain(-audio.dBFS + (-20)) | |
| with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file: | |
| audio.export(temp_file.name, format="mp3") | |
| logger.info(f"Audio preprocessed: {temp_file.name}") | |
| return temp_file.name | |
| except Exception as e: | |
| logger.error(f"Error preprocessing audio file: {str(e)}") | |
| raise | |
| def transcribe_audio(file): | |
| """Transcribes an audio or video file.""" | |
| try: | |
| if isinstance(file, str) and file.startswith('http'): | |
| logger.info(f"Downloading social media video: {file}") | |
| file_path = download_social_media_video(file) | |
| elif isinstance(file, str) and file.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')): | |
| logger.info(f"Converting local video to audio: {file}") | |
| file_path = convert_video_to_audio(file) | |
| else: | |
| logger.info(f"Preprocessing audio file: {file}") | |
| file_path = preprocess_audio(file) | |
| logger.info(f"Transcribing audio: {file_path}") | |
| result = model.transcribe(file_path) | |
| transcription = result.get("text", "Error in transcription") | |
| logger.info(f"Transcription completed: {transcription[:50]}...") | |
| return transcription | |
| except Exception as e: | |
| logger.error(f"Error processing file: {str(e)}") | |
| return f"Error processing file: {str(e)}" | |
| def read_document(document_path): | |
| """Reads content from PDF, DOCX, XLSX or CSV documents.""" | |
| try: | |
| if document_path.endswith(".pdf"): | |
| doc = fitz.open(document_path) | |
| return "\n".join([page.get_text() for page in doc]) | |
| elif document_path.endswith(".docx"): | |
| doc = docx.Document(document_path) | |
| return "\n".join([paragraph.text for paragraph in doc.paragraphs]) | |
| elif document_path.endswith(".xlsx"): | |
| return pd.read_excel(document_path).to_string() | |
| elif document_path.endswith(".csv"): | |
| return pd.read_csv(document_path).to_string() | |
| else: | |
| return "Unsupported file type. Please upload a PDF, DOCX, XLSX or CSV document." | |
| except Exception as e: | |
| return f"Error reading document: {str(e)}" | |
| def read_url(url): | |
| """Reads content from a URL.""" | |
| try: | |
| response = requests.get(url) | |
| response.raise_for_status() | |
| soup = BeautifulSoup(response.content, 'html.parser') | |
| return soup.get_text() | |
| except Exception as e: | |
| return f"Error reading URL: {str(e)}" | |
| def process_social_content(url): | |
| """Processes content from a social media URL, handling both text and video.""" | |
| try: | |
| # First, try to read content as text | |
| text_content = read_url(url) | |
| # Then, try to process as video | |
| try: | |
| video_content = transcribe_audio(url) | |
| except Exception: | |
| video_content = None | |
| return { | |
| "text": text_content, | |
| "video": video_content | |
| } | |
| except Exception as e: | |
| logger.error(f"Error processing social content: {str(e)}") | |
| return None | |
| def generate_news(instructions, facts, size, tone, *args): | |
| """Generates a news article from instructions, facts, URLs, documents, transcriptions, and social media content.""" | |
| knowledge_base = { | |
| "instructions": instructions, | |
| "facts": facts, | |
| "document_content": [], | |
| "audio_data": [], | |
| "url_content": [], | |
| "social_content": [] | |
| } | |
| num_audios = 5 * 3 # 5 audios/videos * 3 fields (file, name, position) | |
| num_social_urls = 3 * 3 # 3 social media URLs * 3 fields (URL, name, context) | |
| num_urls = 5 # 5 general URLs | |
| audios = args[:num_audios] | |
| social_urls = args[num_audios:num_audios+num_social_urls] | |
| urls = args[num_audios+num_social_urls:num_audios+num_social_urls+num_urls] | |
| documents = args[num_audios+num_social_urls+num_urls:] | |
| for url in urls: | |
| if url: | |
| knowledge_base["url_content"].append(read_url(url)) | |
| for document in documents: | |
| if document is not None: | |
| knowledge_base["document_content"].append(read_document(document.name)) | |
| for i in range(0, len(audios), 3): | |
| audio_file, name, position = audios[i:i+3] | |
| if audio_file is not None: | |
| knowledge_base["audio_data"].append({"audio": audio_file, "name": name, "position": position}) | |
| for i in range(0, len(social_urls), 3): | |
| social_url, social_name, social_context = social_urls[i:i+3] | |
| if social_url: | |
| social_content = process_social_content(social_url) | |
| if social_content: | |
| knowledge_base["social_content"].append({ | |
| "url": social_url, | |
| "name": social_name, | |
| "context": social_context, | |
| "text": social_content["text"], | |
| "video": social_content["video"] | |
| }) | |
| logger.info(f"Social media content processed: {social_url}") | |
| transcriptions_text, raw_transcriptions = "", "" | |
| for idx, data in enumerate(knowledge_base["audio_data"]): | |
| if data["audio"] is not None: | |
| transcription = transcribe_audio(data["audio"]) | |
| transcription_text = f'"{transcription}" - {data["name"]}, {data["position"]}' | |
| raw_transcription = f'[Audio/Video {idx + 1}]: "{transcription}" - {data["name"]}, {data["position"]}' | |
| transcriptions_text += transcription_text + "\n" | |
| raw_transcriptions += raw_transcription + "\n\n" | |
| for data in knowledge_base["social_content"]: | |
| if data["text"]: | |
| transcription_text = f'[Social media text]: "{data["text"][:200]}..." - {data["name"]}, {data["context"]}' | |
| transcriptions_text += transcription_text + "\n" | |
| raw_transcriptions += transcription_text + "\n\n" | |
| if data["video"]: | |
| transcription_video = f'[Social media video]: "{data["video"]}" - {data["name"]}, {data["context"]}' | |
| transcriptions_text += transcription_video + "\n" | |
| raw_transcriptions += transcription_video + "\n\n" | |
| document_content = "\n\n".join(knowledge_base["document_content"]) | |
| url_content = "\n\n".join(knowledge_base["url_content"]) | |
| internal_prompt = """ | |
| Instructions for the model: | |
| - Follow news article principles: answer the 5 Ws in the first paragraph (Who?, What?, When?, Where?, Why?). | |
| - Ensure at least 80% of quotes are direct and in quotation marks. | |
| - The remaining 20% can be indirect quotes. | |
| - Don't invent new information. | |
| - Be rigorous with provided facts. | |
| - When processing uploaded documents, extract and highlight important quotes and testimonials from sources. | |
| - When processing uploaded documents, extract and highlight key figures. | |
| - Avoid using the date at the beginning of the news body. Start directly with the 5Ws. | |
| - Include social media content relevantly, citing the source and providing proper context. | |
| - Make sure to relate the provided context for social media content with its corresponding transcription or text. | |
| """ | |
| prompt = f""" | |
| {internal_prompt} | |
| Write a news article with the following information, including a title, a 15-word hook (additional information that complements the title), and the content body with {size} words. The tone should be {tone}. | |
| Instructions: {knowledge_base["instructions"]} | |
| Facts: {knowledge_base["facts"]} | |
| Additional content from documents: {document_content} | |
| Additional content from URLs: {url_content} | |
| Use the following transcriptions as direct and indirect quotes (without changing or inventing content): | |
| {transcriptions_text} | |
| """ | |
| try: | |
| response = openai.ChatCompletion.create( | |
| model="gpt-4o-mini", | |
| messages=[{"role": "user", "content": prompt}], | |
| temperature=0.1 | |
| ) | |
| news = response['choices'][0]['message']['content'] | |
| return news, raw_transcriptions | |
| except Exception as e: | |
| logger.error(f"Error generating news article: {str(e)}") | |
| return f"Error generating news article: {str(e)}", "" | |
| with gr.Blocks() as demo: | |
| gr.Markdown("## All-in-One News Generator") | |
| # Add tool description and attribution | |
| gr.Markdown(""" | |
| ### About this tool | |
| This AI-powered news generator helps journalists and content creators produce news articles by processing multiple types of input: | |
| - Audio and video files with automatic transcription | |
| - Social media content | |
| - Documents (PDF, DOCX, XLSX, CSV) | |
| - Web URLs | |
| The tool uses advanced AI to generate well-structured news articles following journalistic principles and maintaining the integrity of source quotes. | |
| Created by [Camilo Vega](https://www.linkedin.com/in/camilo-vega-169084b1/), AI Consultant | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| instructions = gr.Textbox(label="News article instructions", lines=2) | |
| facts = gr.Textbox(label="Describe the news facts", lines=4) | |
| size = gr.Number(label="Content body size (in words)", value=100) | |
| tone = gr.Dropdown(label="News tone", choices=["serious", "neutral", "lighthearted"], value="neutral") | |
| with gr.Column(scale=3): | |
| inputs_list = [instructions, facts, size, tone] | |
| with gr.Tabs(): | |
| for i in range(1, 6): | |
| with gr.TabItem(f"Audio/Video {i}"): | |
| file = gr.File(label=f"Audio/Video {i}", type="filepath", file_types=["audio", "video"]) | |
| name = gr.Textbox(label="Name", scale=1) | |
| position = gr.Textbox(label="Position", scale=1) | |
| inputs_list.extend([file, name, position]) | |
| for i in range(1, 4): | |
| with gr.TabItem(f"Social Media {i}"): | |
| social_url = gr.Textbox(label=f"Social media URL {i}", lines=1) | |
| social_name = gr.Textbox(label=f"Person/account name {i}", scale=1) | |
| social_context = gr.Textbox(label=f"Content context {i}", lines=2) | |
| inputs_list.extend([social_url, social_name, social_context]) | |
| for i in range(1, 6): | |
| with gr.TabItem(f"URL {i}"): | |
| url = gr.Textbox(label=f"URL {i}", lines=1) | |
| inputs_list.append(url) | |
| for i in range(1, 6): | |
| with gr.TabItem(f"Document {i}"): | |
| document = gr.File(label=f"Document {i}", type="filepath", file_count="single") | |
| inputs_list.append(document) | |
| gr.Markdown("---") # Visual separator | |
| with gr.Row(): | |
| transcriptions_output = gr.Textbox(label="Transcriptions", lines=10) | |
| gr.Markdown("---") # Visual separator | |
| with gr.Row(): | |
| generate = gr.Button("Generate Draft") | |
| with gr.Row(): | |
| news_output = gr.Textbox(label="Generated Draft", lines=20) | |
| generate.click(fn=generate_news, inputs=inputs_list, outputs=[news_output, transcriptions_output]) | |
| # Add description about how to use the app | |
| gr.Markdown(""" | |
| ### How to Use This App | |
| 1. **Input your requirements:** | |
| - Enter your news article instructions | |
| - Describe the key facts of your news story | |
| - Set the desired word count and tone | |
| 2. **Add your sources:** | |
| - Upload audio/video files for automatic transcription | |
| - Add social media URLs to extract content | |
| - Include web URLs for additional information | |
| - Upload documents (PDF, DOCX, XLSX, CSV) to extract relevant data | |
| 3. **Generate your draft:** | |
| - Click "Generate Draft" to create your news article | |
| - Review the transcriptions to verify source accuracy | |
| - Use the generated draft as a starting point for your news story | |
| This tool helps streamline the news writing process by automatically gathering, organizing, and synthesizing information from multiple sources into a cohesive article that follows journalistic best practices. | |
| Created by [Camilo Vega](https://www.linkedin.com/in/camilo-vega-169084b1/), AI Consultant | |
| """) | |
| demo.launch(share=True) |