career_conversations / Task extract /process_meeting_notes.py
dimostchv's picture
Upload folder using huggingface_hub
b4a8f90 verified
#!/usr/bin/env python3
"""
Meeting Notes Processor
----------------------
This script processes meeting transcript files and creates structured notes in Notion.
It uses OpenAI's API to analyze the transcripts and extract key information.
"""
import os
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any, Union, TypedDict
from dotenv import load_dotenv
from openai import OpenAI
from notion_client import Client
from openai.types.chat import ChatCompletion
from openai.types.chat.chat_completion_message import ChatCompletionMessage
from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
from openai.types.chat.chat_completion_system_message_param import ChatCompletionSystemMessageParam
from openai.types.chat.chat_completion_user_message_param import ChatCompletionUserMessageParam
from openai.types.chat.chat_completion_assistant_message_param import ChatCompletionAssistantMessageParam
# Load environment variables and initialize clients.
# NOTE: everything below runs at import time, i.e. before main() performs its
# environment-variable checks.
# override=True is passed to load_dotenv; presumably values from the .env file
# then take precedence over variables already set in the process — confirm
# against the python-dotenv documentation.
load_dotenv(override=True)
# No credentials are passed explicitly here; presumably the client picks up
# OPENAI_API_KEY from the environment (main() checks that variable).
openai = OpenAI()
# os.getenv returns None when NOTION_ACCESS_TOKEN is unset, so the client may
# be constructed without a token; main() reports that case before any work.
notion = Client(auth=os.getenv("NOTION_ACCESS_TOKEN"))
# Constants
# ID of the Notion database that is used as the parent of every created page.
DATABASE_ID = "214cfc87-3516-801f-9cf5-f6709213c7a0"
# One entry of the chat history handed to chat(). Only the message text is
# stored; chat() derives the speaker role from the entry's position.
HistoryMessage = TypedDict("HistoryMessage", {"content": str})
def get_transcript_files() -> List[Path]:
    """Return the transcript files waiting to be processed.

    Looks in the ``transcripts`` directory that sits next to this script.
    Both ``transcripts`` and the sibling ``processed`` directory are created
    if they do not exist yet.

    Returns:
        The regular ``*.txt`` files found directly inside ``transcripts``,
        sorted by path so that the processing order is deterministic.
    """
    script_dir = Path(__file__).parent
    transcript_dir = script_dir / "transcripts"
    processed_dir = script_dir / "processed"

    for directory in (transcript_dir, processed_dir):
        directory.mkdir(exist_ok=True)

    # glob() yields entries in arbitrary order and also matches directories
    # whose name ends in ".txt"; keep only real files and sort them.
    return sorted(
        candidate
        for candidate in transcript_dir.glob("*.txt")
        if candidate.is_file()
    )
def read_transcript(file_path: Path) -> str:
    """Return the complete text of the transcript at *file_path* (UTF-8)."""
    return Path(file_path).read_text(encoding="utf-8")
def move_to_processed(file_path: Path) -> None:
    """Archive a handled transcript in the ``processed`` directory.

    The file is renamed into ``processed`` (next to this script) and the
    current local time is appended to its stem, so that transcripts sharing a
    name do not overwrite each other.
    """
    archive_dir = Path(__file__).parent / "processed"
    # Make sure the destination exists before renaming into it
    archive_dir.mkdir(exist_ok=True)

    # Second-resolution timestamp keeps archived names unique
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    destination = archive_dir / f"{file_path.stem}_{stamp}{file_path.suffix}"

    file_path.rename(destination)
    print(f"βœ… Moved transcript to {destination}")
def get_system_prompt(transcript: str) -> str:
    """Build the system prompt that asks the model for structured notes.

    The prompt describes the JSON object the model must return (keys
    ``meeting_title``, ``participants``, ``category``, ``summary``,
    ``content``, ``action_items``, ``meeting_url``, ``date`` and
    ``follow_up_items``), lists note-taking guidelines, and embeds the
    transcript verbatim near the end.

    Args:
        transcript: Full text of the meeting transcript; it is interpolated
            into the prompt unchanged.

    Returns:
        The complete prompt text.
    """
    # Doubled braces ({{ and }}) below are literal braces in the f-string;
    # {transcript} is the only interpolated field.
    return f"""You are a detailed notes processor. You are given a transcript of a meeting and you need to process the notes into a comprehensive, structured JSON format.
Please analyze the transcript thoroughly and return a JSON object with the following structure:
{{
"meeting_title": "Descriptive title capturing the main purpose of the meeting",
"participants": "Comma-separated list of attendees, first name only, use first name and first letter of surname when duplicate i.e. Dimo S",
"category": "One of: Coaching, dsm-firmenich, PakTech, BDB Internal, Other - select the most appropriate category based on the discussion context",
"summary": "A comprehensive 2-3 sentence summary covering the main topics discussed, key decisions made, and overall meeting outcome",
"content": {{
"Key Discussions": [
"Detailed points of discussion, including context and background information",
"Capture all important details, examples, and explanations provided",
"Include technical details, numbers, and specific references when mentioned"
],
"Decisions Made": [
"List all decisions made during the meeting",
"Include the context and reasoning behind each decision",
"Note any conditions or dependencies for the decisions"
],
"Challenges & Concerns": [
"Document any challenges, risks, or concerns raised",
"Include proposed solutions or mitigation strategies discussed",
"Note any unresolved issues that need follow-up"
],
"Next Steps": [
"List strategic next steps discussed",
"Include any dependencies or prerequisites mentioned",
"Note any timeline considerations"
]
}},
"action_items": [
{{
"task": "Specific, actionable task description",
"assignee": "Person assigned (or 'Unassigned')",
"deadline": "Deadline if mentioned (or 'Not specified')",
"dependencies": "Any dependencies or prerequisites mentioned",
"priority": "High/Medium/Low if indicated (or 'Not specified')"
}}
],
"meeting_url": "Meeting URL if mentioned (or null)",
"date": "IMPORTANT: Extract the actual meeting date from the transcript. Look for date references like 'scheduled for', 'meeting on', etc. Return in YYYY-MM-DD format. If multiple dates are mentioned, use the actual meeting date, not future dates mentioned for tasks. If no date is found, return null.",
"follow_up_items": [
"List of topics that need follow-up in future meetings",
"Include any parking lot items or tabled discussions"
]
}}
Guidelines:
- Be thorough and detailed in capturing all discussion points
- Maintain chronological order within each section when relevant
- Include specific examples, numbers, and technical details mentioned
- Capture the context and reasoning behind decisions and action items
- Note any disagreements or alternative viewpoints expressed
- Include any resource links or references mentioned
- Document any blockers, dependencies, and risks discussed
- Capture any parking lot items or topics deferred for future discussion
- Pay special attention to extracting the correct meeting date from the transcript
The meeting transcript is:
{transcript}
Return only the JSON object, no additional text."""
def chat(message: str, history: List[HistoryMessage], transcript: str) -> str:
    """Ask the OpenAI chat API a question about a transcript.

    Args:
        message: The new user message to send.
        history: Earlier turns, alternating user / assistant and starting
            with the user (even positions are user turns).
        transcript: Meeting transcript embedded in the system prompt.

    Returns:
        The text content of the first completion choice.

    Raises:
        ValueError: If the API returns no choice or an empty message.
    """
    # Even positions in the history are user turns, odd ones assistant turns
    speakers = ("user", "assistant")
    earlier_turns: List[Union[ChatCompletionUserMessageParam, ChatCompletionAssistantMessageParam]] = [
        {"role": speakers[position % 2], "content": entry["content"]}
        for position, entry in enumerate(history)
    ]

    conversation: List[ChatCompletionMessageParam] = [
        {"role": "system", "content": get_system_prompt(transcript)},
        *earlier_turns,
        {"role": "user", "content": message},
    ]

    completion: ChatCompletion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
    )

    # Walk down to the reply text, tolerating a missing level at each step
    choices = completion.choices
    reply = choices[0].message if choices else None
    text = reply.content if reply else None
    if not text:
        raise ValueError("No response received from OpenAI API")
    return text
# Notion API request limits (see the Notion "Request limits" reference).
_NOTION_TEXT_LIMIT = 2000     # max characters in one rich-text text.content
_NOTION_CHILDREN_LIMIT = 100  # max child blocks accepted in a single request


def _rich_text(content: Any) -> List[Dict[str, Any]]:
    """Return Notion rich-text objects for *content*, split at the length limit."""
    text = "" if content is None else str(content)
    if not text:
        return [{"type": "text", "text": {"content": ""}}]
    return [
        {"type": "text", "text": {"content": text[start:start + _NOTION_TEXT_LIMIT]}}
        for start in range(0, len(text), _NOTION_TEXT_LIMIT)
    ]


def _text_block(block_type: str, content: Any, **extra: Any) -> Dict[str, Any]:
    """Return a Notion block of *block_type* whose text is *content*."""
    return {
        "object": "block",
        "type": block_type,
        block_type: {"rich_text": _rich_text(content), **extra},
    }


def create_meeting_note_with_formatting(
    name: str,
    category: str,
    participants: str,
    summary: str,
    content_dict: Optional[Dict[str, Any]] = None,
    action_items: Optional[List[str]] = None,
    meeting_url: Optional[str] = None,
    date: Optional[str] = None
) -> Optional[Dict[str, Any]]:
    """Create a formatted meeting-note page in the Notion database.

    The page gets the ``Name``, ``Category``, ``Participants`` and ``Summary``
    properties (plus ``Date`` / ``Meeting URL`` when given). Its body is a
    "Meeting Notes" heading, one sub-heading with bullets per content
    section, and an "Action Items" to-do list.

    Text longer than Notion's per-object limit is split across several
    rich-text objects, non-string items are converted with ``str()``, and
    blocks beyond the per-request limit are appended in follow-up requests.

    Args:
        name: Page title.
        category: Name of the select option for ``Category``.
        participants: Attendee list shown in ``Participants``.
        summary: Short meeting summary.
        content_dict: Mapping of section title to a list of bullet texts
            (a non-list value becomes a single bullet).
        action_items: Texts for unchecked to-do items.
        meeting_url: Optional URL stored in ``Meeting URL``.
        date: Optional start date stored in ``Date``.

    Returns:
        The created page as returned by the Notion client, or ``None`` when
        the request fails or returns an unexpected type (the error is
        printed, not raised).
    """
    # Build properties based on the database schema
    properties: Dict[str, Any] = {
        "Name": {"title": _rich_text(name)},
        "Category": {"select": {"name": category}},
        "Participants": {"rich_text": _rich_text(participants)},
        "Summary": {"rich_text": _rich_text(summary)},
    }
    # Only add the date if one was found in the transcript
    if date:
        properties["Date"] = {"date": {"start": date}}
    if meeting_url:
        properties["Meeting URL"] = {"url": meeting_url}

    # Build the page body
    children: List[Dict[str, Any]] = [_text_block("heading_2", "Meeting Notes")]

    if content_dict and isinstance(content_dict, dict):
        for section, items in content_dict.items():
            children.append(_text_block("heading_3", section))
            # A scalar section value is rendered as a single bullet
            bullets = items if isinstance(items, list) else [items]
            children.extend(
                _text_block("bulleted_list_item", bullet) for bullet in bullets
            )

    if action_items:
        children.append(_text_block("heading_3", "Action Items"))
        children.extend(
            _text_block("to_do", item, checked=False) for item in action_items
        )

    try:
        # Create the page with the first batch of blocks
        new_page = notion.pages.create(
            parent={"database_id": DATABASE_ID},
            properties=properties,
            children=children[:_NOTION_CHILDREN_LIMIT]
        )
        if not isinstance(new_page, dict):
            print("❌ Error: Unexpected response type from Notion API")
            return None

        # Append whatever did not fit into the creation request.
        # NOTE: a failure here leaves a partially filled page behind.
        for start in range(_NOTION_CHILDREN_LIMIT, len(children), _NOTION_CHILDREN_LIMIT):
            notion.blocks.children.append(
                block_id=new_page["id"],
                children=children[start:start + _NOTION_CHILDREN_LIMIT]
            )

        print(f"βœ… Meeting note '{name}' created successfully!")
        print(f"Page URL: {new_page.get('url', 'URL not found')}")
        return new_page
    except Exception as e:
        # Best-effort boundary: report the failure and let the caller decide
        print(f"❌ Error creating meeting note: {e}")
        return None
# Placeholder values the model uses to say "nothing was mentioned"
_EMPTY_FIELD_VALUES = frozenset({"", "unassigned", "not specified", "none", "n/a"})

# Keys of the AI response that are passed on as mandatory page properties
_REQUIRED_FIELDS = ("meeting_title", "category", "participants", "summary")

# (key in the action item, label shown in the note), in display order
_ACTION_ITEM_LABELS = (
    ("assignee", "Assigned to"),
    ("deadline", "Due"),
    ("priority", "Priority"),
    ("dependencies", "Dependencies"),
)


def _strip_code_fence(raw: str) -> str:
    """Return *raw* without a surrounding Markdown code fence, if it has one."""
    text = raw.strip()
    if not text.startswith("```"):
        return text
    # Drop the opening fence line (``` or ```json) ...
    first_newline = text.find("\n")
    text = text[first_newline + 1:] if first_newline != -1 else text[3:]
    # ... and the closing fence
    text = text.rstrip()
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _field_value(item: Dict[str, Any], key: str) -> Optional[str]:
    """Return the meaningful text of ``item[key]``, or None if absent/placeholder."""
    value = item.get(key)
    if value is None:
        return None
    if isinstance(value, (list, tuple)):
        value = ", ".join(str(part) for part in value)
    text = str(value).strip()
    return None if text.lower() in _EMPTY_FIELD_VALUES else text


def _format_action_item(item: Any) -> str:
    """Return the one-line to-do text for a single action item."""
    if not isinstance(item, dict):
        return str(item)
    task = _field_value(item, "task") or "Untitled task"
    details = []
    for key, label in _ACTION_ITEM_LABELS:
        value = _field_value(item, key)
        if value is not None:
            details.append(f"{label}: {value}")
    # Only add the " - " separator when there is something to follow it
    return f"{task} - {', '.join(details)}" if details else task


def _valid_iso_date(value: Any) -> Optional[str]:
    """Return *value* if it is an ISO 8601 date/datetime string, else None."""
    if not isinstance(value, str) or not value.strip():
        return None
    text = value.strip()
    try:
        # Older Python versions do not accept a trailing "Z" for UTC
        datetime.fromisoformat(text.replace("Z", "+00:00"))
    except ValueError:
        print(f"Warning: ignoring unrecognized meeting date {value!r}")
        return None
    return text


def process_ai_response_to_notion_formatted(ai_response: str) -> Optional[Dict[str, Any]]:
    """Turn the model's JSON reply into a formatted Notion meeting note.

    The reply may be wrapped in a Markdown code fence. Action items are
    flattened into one line each (fields that are missing or hold a
    placeholder such as "Not specified" are skipped), follow-up items are
    added as an extra content section, and a date that is not a valid ISO
    string is dropped instead of failing the whole request.

    Args:
        ai_response: JSON text returned by the model (an already parsed
            dict is accepted as well).

    Returns:
        The page returned by ``create_meeting_note_with_formatting``, or
        ``None`` when the reply cannot be parsed, lacks a required field,
        or the note cannot be created. Errors are printed, not raised.
    """
    try:
        if isinstance(ai_response, str):
            meeting_data = json.loads(_strip_code_fence(ai_response))
        else:
            meeting_data = ai_response
    except json.JSONDecodeError as e:
        print(f"❌ Error parsing AI response as JSON: {e}")
        return None

    if not isinstance(meeting_data, dict):
        print("❌ Error processing AI response: expected a JSON object")
        return None

    missing = [key for key in _REQUIRED_FIELDS if meeting_data.get(key) is None]
    if missing:
        print(f"❌ Error processing AI response: missing required field(s): {', '.join(missing)}")
        return None

    try:
        raw_items = meeting_data.get("action_items") or []
        if not isinstance(raw_items, list):
            raw_items = [raw_items]
        formatted_action_items = [_format_action_item(item) for item in raw_items]

        # Copy so that adding the follow-up section does not mutate the input
        content = meeting_data.get("content")
        content_dict = dict(content) if isinstance(content, dict) else {}
        if meeting_data.get("follow_up_items"):
            content_dict["Follow-up Items"] = meeting_data["follow_up_items"]

        participants = meeting_data["participants"]
        if isinstance(participants, (list, tuple)):
            participants = ", ".join(str(person) for person in participants)

        return create_meeting_note_with_formatting(
            name=str(meeting_data["meeting_title"]),
            category=str(meeting_data["category"]),
            participants=str(participants),
            summary=str(meeting_data["summary"]),
            content_dict=content_dict,
            action_items=formatted_action_items,
            meeting_url=meeting_data.get("meeting_url") or None,
            date=_valid_iso_date(meeting_data.get("date"))
        )
    except Exception as e:
        # Best-effort boundary: one bad reply must not stop the whole batch
        print(f"❌ Error processing AI response: {e}")
        return None
def process_all_transcripts() -> None:
    """Run the transcript-to-Notion pipeline for every pending transcript.

    Each file is read, summarised by the model and written to Notion. A
    transcript is moved to the processed directory only after its note was
    created. An error on one file is printed and the loop moves on to the
    next file.
    """
    pending = get_transcript_files()

    if len(pending) == 0:
        transcript_dir = Path(__file__).parent / "transcripts"
        print(f"\n❌ No transcript files found in {transcript_dir}")
        print("Please place your .txt transcript files in the 'transcripts' directory")
        return

    print(f"\nFound {len(pending)} transcript files to process in the transcripts directory.")

    for current in pending:
        print(f"\nProcessing {current.name}...")
        try:
            # Transcript text -> AI notes -> Notion page
            text = read_transcript(current)
            ai_notes = chat("What are the notes?", [], text)
            page = process_ai_response_to_notion_formatted(ai_notes)

            if not page:
                print(f"❌ Failed to process {current.name}")
                continue

            print(f"βœ… Successfully processed {current.name}")
            # Archive the transcript only once its note exists
            move_to_processed(current)
        except Exception as e:
            print(f"❌ Error processing {current.name}: {e}")
def main():
    """Script entry point: verify credentials, then process all transcripts.

    Processing starts only when both OPENAI_API_KEY and NOTION_ACCESS_TOKEN
    are set; otherwise the first missing variable is reported and the
    function returns.
    """
    # Checked in this order; the first missing variable stops the run
    for variable in ("OPENAI_API_KEY", "NOTION_ACCESS_TOKEN"):
        if not os.getenv(variable):
            print(f"❌ Error: {variable} environment variable is not set")
            return

    process_all_transcripts()
# Run the pipeline only when the file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()