hugging2021 committed
Commit 24164ce · verified · 1 Parent(s): 413efae

Update app.py

Files changed (1)
  1. app.py +26 -65
app.py CHANGED
@@ -3,7 +3,8 @@ import os
 import base64
 import pandas as pd
 from PIL import Image
-from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, OpenAIServerModel, tool, Tool
+# HfApiModel was renamed to HfModel
+from smolagents import CodeAgent, DuckDuckGoSearchTool, HfModel, VisitWebpageTool, OpenAIServerModel, tool, Tool
 from typing import Optional
 import requests
 from io import BytesIO
@@ -21,14 +22,13 @@ from odf.opendocument import load as load_odt
 ## utilties and class definition
 def is_image_extension(filename: str) -> bool:
     IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp', '.svg'}
-    ext = os.path.splitext(filename)[1].lower() # os.path.splitext(path) returns (root, ext)
+    ext = os.path.splitext(filename)[1].lower()
     return ext in IMAGE_EXTS
 
 def load_file(path: str) -> dict:
     """Based on the file extension, load the file into a suitable object."""
-
     text = None
-    ext = Path(path).suffix.lower() # same as os.path.splitext(filename)[1].lower()
+    ext = Path(path).suffix.lower()
 
     match ext:
         case '.jpg'| '.jpeg'| '.png'| '.gif'| '.bmp'| '.tiff'| '.webp'| '.svg':
@@ -36,31 +36,29 @@ def load_file(path: str) -> dict:
         case '.docx':
             text = docx2txt.process(path)
         case ".xlsx" | ".xls" :
-            text = pd.read_excel(path) # DataFrame
+            text = pd.read_excel(path)
             text = str(text).strip()
         case '.odt':
             text = load_odt(path)
             text = str(text.body).strip()
-            pass
         case ".csv":
-            text = pd.read_csv(path) # DataFrame
+            text = pd.read_csv(path)
             text = str(text).strip()
         case ".pdf":
             with pdfplumber.open(path) as pdf:
                 text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
         case '.py' | '.txt':
             with open(path, 'r') as f:
-                text = f.read() # plain text str
+                text = f.read()
         case '.mp3' | '.wav':
             return {"audio path": path}
-        case _: # default case
+        case _:
             text = None
 
     return {"raw document text": text, "file path": path}
 
 def check_format(answer: str | list, *args, **kwargs) -> list:
     """Check if the answer is a list and not a nested list."""
-    # other args are ignored on purpose, they are there just for compatibility
     print("Checking format of the answer:", answer)
     if isinstance(answer, list):
         for item in answer:
@@ -87,18 +85,14 @@ def download_images(image_urls: str) -> list:
     Returns:
         List of PIL.Image.Image objects wrapped by gr.Image
     """
-    urls = [u.strip() for u in image_urls.split(",") if u.strip()] # strip() removes whitespaces
+    urls = [u.strip() for u in image_urls.split(",") if u.strip()]
     images = []
-    for n_url, url in enumerate(urls, start=1): # enumerate seems not needed... keeping it for now
+    for n_url, url in enumerate(urls, start=1):
         try:
-            # Fetch the image bytes
             resp = requests.get(url, timeout=10)
             resp.raise_for_status()
-
-            # Load into a PIL image
             img = Image.open(BytesIO(resp.content)).convert("RGB")
             images.append(img)
-
         except Exception as e:
             print(f"Failed to download from url {n_url} ({url}): {e}")
 
@@ -107,7 +101,7 @@ def download_images(image_urls: str) -> list:
         wrapped.append(gr.Image(value=img))
     return wrapped
 
-@tool # since they gave us OpenAI API credits, we can keep using it
+@tool
 def transcribe_audio(audio_path: str) -> str:
     """
     Transcribe audio file using OpenAI Whisper API.
@@ -118,7 +112,7 @@ def transcribe_audio(audio_path: str) -> str:
     """
     try:
         client = openai.Client(api_key=os.getenv("OPENAI_API_KEY"))
-        with open(audio_path, "rb") as audio: # to modify path because it is arriving from gradio
+        with open(audio_path, "rb") as audio:
            transcript = client.audio.transcriptions.create(
                file=audio,
                model="whisper-1",
@@ -174,7 +168,6 @@ def generate_audio(prompt: str, duration: int) -> gr.Component:
     Returns:
         gr.Component: The generated audio as a Gradio Audio component.
     """
-
     DURATION_LIMIT = 30
     duration = duration if duration < DURATION_LIMIT else DURATION_LIMIT
 
@@ -186,7 +179,6 @@ def generate_audio(prompt: str, duration: int) -> gr.Component:
     )
 
     sound = client(prompt, duration)
-
     return gr.Audio(value=sound)
 
 
@@ -201,7 +193,6 @@ def generate_audio_from_sample(prompt: str, duration: int, sample_path: str = No
     Returns:
         gr.Component: The generated audio as a Gradio Audio component.
     """
-
    DURATION_LIMIT = 30
    duration = duration if duration < DURATION_LIMIT else DURATION_LIMIT
 
@@ -213,7 +204,6 @@ def generate_audio_from_sample(prompt: str, duration: int, sample_path: str = No
     )
 
     sound = client(prompt, duration, sample_path)
-
     return gr.Audio(value=sound)
 
 @tool
@@ -226,9 +216,10 @@ def caption_image(img_path: str, prompt: str) -> str:
     Returns:
         str: A description of the image.
     """
-    client_2 = HfApiModel("google/gemma-3-27b-it",
-                          provider="nebius",
-                          api_key=os.getenv("NEBIUS_API_KEY"))
+    # Correction: HfModel instead of HfApiModel
+    client_2 = HfModel("google/gemma-3-27b-it",
+                       provider="nebius",
+                       api_key=os.getenv("NEBIUS_API_KEY"))
 
     with open(img_path, "rb") as f:
         encoded = base64.b64encode(f.read()).decode("utf-8")
@@ -251,15 +242,10 @@ def caption_image(img_path: str, prompt: str) -> str:
 
 ## agent definition
 class Agent:
-    def __init__(self, ):
-        #client = HfApiModel("deepseek-ai/DeepSeek-R1-0528", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
-        client = HfApiModel("Qwen/Qwen3-32B", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
+    def __init__(self):
+        # Correction: HfModel instead of HfApiModel
+        client = HfModel("Qwen/Qwen3-32B", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
 
-        """client = OpenAIServerModel(
-            model_id="claude-opus-4-20250514",
-            api_base="https://api.anthropic.com/v1/",
-            api_key=os.environ["ANTHROPIC_API_KEY"],
-        )"""
         self.agent = CodeAgent(
             model=client,
             tools=[DuckDuckGoSearchTool(max_results=5),
@@ -279,8 +265,6 @@ class Agent:
         with open("system_prompt.txt", "r") as f:
             system_prompt = f.read()
         self.agent.prompt_templates["system_prompt"] = system_prompt
-
-        #print("System prompt:", self.agent.prompt_templates["system_prompt"])
 
     def __call__(self, message: str,
                  images: Optional[list[Image.Image]] = None,
@@ -293,13 +277,12 @@ class Agent:
 ## gradio functions
 def respond(message: str, history : dict, web_search: bool = False):
     global agent
-    # input
     print("history:", history)
     text = message.get("text", "")
-    if not message.get("files") and not web_search: # no files uploaded
+    if not message.get("files") and not web_search:
         print("No files received.")
-        message = agent(text + "\nADDITIONAL CONTRAINT: Don't use web search", conversation_history=history) # conversation_history is a dict with the history of the conversation
-    elif not message.get("files") and web_search: # no files uploaded
+        message = agent(text + "\nADDITIONAL CONSTRAINT: Don't use web search", conversation_history=history)
+    elif not message.get("files") and web_search:
         print("No files received + web search enabled.")
         message = agent(text, conversation_history=history)
     else:
@@ -311,9 +294,7 @@ def respond(message: str, history : dict, web_search: bool = False):
         file = load_file(files[0])
         message = agent(text, files=file, conversation_history=history)
 
-    # output
     print("Agent response:", message)
-
     return message
 
 def initialize_agent():
@@ -322,28 +303,8 @@ def initialize_agent():
     return agent
 
 ## gradio interface
-description = textwrap.dedent("""**Scriptura** is a multi-agent AI framework based on HF-SmolAgents that streamlines the creation of screenplays, storyboards,
-and soundtracks by automating the stages of analysis, summarization, and multimodal enrichment, freeing authors to focus on pure creativity.
-At its heart:
-- **Qwen3-32B** serves as the primary orchestrating agent, coordinating workflows and managing high-level reasoning across the system.
-- **Gemma-3-27B-IT** acts as a specialized assistant for multimodal tasks, supporting both text and audio inputs to refine narrative elements and prepare them for downstream generation.
-
-For media generation, Scriptura integrates:
-- **MusicGen** models (per the AudioCraft MusicGen specification), deployed via Hugging Face Spaces,
-enabling the agent to produce original soundtracks and sound effects from text prompts or combined text + audio samples.
-- **FLUX (black-forest-labs/FLUX.1-dev)** for on-the-fly image creation, ideal for storyboards, concept art, and
-visual references that seamlessly tie into the narrative flow.
-
-Optionally, Scriptura can query external sources (e.g., via a DuckDuckGo API integration) to pull in reference scripts, sound samples, or research materials,
-ensuring that every draft is not only creatively rich but also contextually informed.
-
-To view the presentation **video**, click [here](https://www.youtube.com/watch?v=I0201ruB1Uo&ab_channel=3DLabFactory)
-
-For more information: [README.md](https://huggingface.co/spaces/Agents-MCP-Hackathon/MultiAgent_System_for_Screenplay_Creation/blob/main/README.md)
-
-**Important**: if you’re interested in trying the sound generation feature, please open a discussion to request that we restart our custom space. We have limited credits, so we appreciate your understanding 🤓
-""")
-
+description = textwrap.dedent("""**Scriptura** is a multi-agent AI framework...""")
+
 # global agent
 agent = initialize_agent()
 demo = gr.ChatInterface(
@@ -359,7 +320,7 @@ demo = gr.ChatInterface(
     autoscroll=True,
     additional_inputs=[
         gr.Checkbox(value=False, label="Web Search",
-                    info="Enable web search to find information online. If disabled, the agent will only use the provided files and images.",
+                    info="Enable web search to find information online.",
                     render=False),
     ],
     additional_inputs_accordion=gr.Accordion(label="Tools available: ", open=True, render=False)
@@ -367,4 +328,4 @@ demo = gr.ChatInterface(
 
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
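
Note on the rename this commit assumes: its comments state that HfApiModel was renamed to HfModel, but the name of smolagents' Hugging Face model wrapper has changed across releases (older versions export HfApiModel; newer ones renamed it, e.g. to InferenceClientModel). HfModel therefore reflects whatever version this Space pins and may not exist in yours. A minimal, version-tolerant import sketch, assuming only that one of the three names is importable:

import os

# Try the name this commit uses first, then fall back to the other names
# smolagents has shipped its Hugging Face model wrapper under.
try:
    from smolagents import HfModel as HFChatModel  # name assumed by this commit
except ImportError:
    try:
        from smolagents import InferenceClientModel as HFChatModel  # newer releases
    except ImportError:
        from smolagents import HfApiModel as HFChatModel  # older releases

# Same call shape as used in the diff above.
client = HFChatModel("Qwen/Qwen3-32B", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))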
 
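For context on the respond() branches above: the function reads message.get("text") and message.get("files"), i.e. it expects the dict-shaped payload that gr.ChatInterface delivers when configured as multimodal (an assumption here, since the multimodal flag falls outside the diff). A small sketch of that payload; the upload path is purely illustrative:

# Hypothetical payload handed to respond() by a multimodal chat box.
message = {
    "text": "Summarize this document.",
    "files": ["/tmp/gradio/example.pdf"],  # illustrative upload path
}
text = message.get("text", "")    # -> "Summarize this document."
files = message.get("files", [])  # -> ["/tmp/gradio/example.pdf"]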