import openai
from dotenv import load_dotenv

# Read OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()

class TranscriptTransformer:
    def __init__(self, model="gpt-4o-mini", max_tokens=128000):
        self.model = model
        self.max_model_tokens = max_tokens
        self.final_response_system_prompt = """You are an expert educational content creator.
Your task is to transform informal transcripts into structured, engaging teaching materials.
Focus on clarity, logical flow, and educational value."""
        self.summarizer_system_prompt = """You are an expert educational content creator.
Your task is to summarize the following transcript chunk.
Ensure that the summary is concise and captures the main points.
"""
        self.final_response_user_prompt = """Transform the following transcript into a structured {duration}-minute lecture.
Requirements:
1. Create a clear introduction that sets context and learning objectives
2. Organize the content into logical sections with clear headings
3. Include practical examples and real-world applications
4. Add discussion questions or interactive elements
5. Conclude with a summary and key takeaways
6. Target approximately {word_count} words
Format the output in markdown with clear section headers and proper spacing.
"""
        self.summarizer_user_prompt = """
{context}
Summarize the following transcript chunk:
{chunk}
"""
    def split_text_into_chunks(self, text: str) -> list[str]:
        """Split the text into chunks that fit within the model's token limit.

        Uses a rough heuristic of ~4 characters per token instead of a real
        tokenizer, so chunk boundaries are approximate.
        """
        max_chunk_chars = self.max_model_tokens * 4  # ~4 chars per token
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0
        for word in words:
            current_chunk.append(word)
            current_length += len(word) + 1  # +1 for the joining space
            if current_length >= max_chunk_chars:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        return chunks
    def generate_transcript_chunk_summary(
        self, chunk: str, max_output_tokens: int, previous_summary=None
    ) -> str:
        """Summarize the current chunk with the previous chunk's summary as context."""
        context = (
            f"Summary of previous transcript chunk:\n{previous_summary}\n\n"
            if previous_summary
            else ""
        )
        response = openai.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.summarizer_system_prompt},
                {
                    "role": "user",
                    "content": self.summarizer_user_prompt.format(
                        context=context, chunk=chunk
                    ),
                },
            ],
            max_tokens=max_output_tokens,
            temperature=0.7,
        )
        # Fall back to the raw chunk if the model returns an empty message.
        return response.choices[0].message.content or chunk
    def summarize_text(self, transcript: str, max_output_tokens: int) -> str:
        """Process a large transcript by splitting it into chunks and combining the chunk summaries."""
        chunks = self.split_text_into_chunks(transcript)
        if len(chunks) == 1:
            # The transcript already fits in a single chunk; return it unsummarized.
            return chunks[0]
        summarized_transcript = []
        previous_summary = None
        for chunk in chunks:
            chunk_summary = self.generate_transcript_chunk_summary(
                chunk, max_output_tokens, previous_summary
            )
            summarized_transcript.append(chunk_summary)
            previous_summary = chunk_summary
        return "\n\n".join(summarized_transcript)
    def generate_lecture(self, raw_text: str, lecture_duration: int = 30) -> str:
        # An average-paced lecturer speaks about 130 words per minute.
        target_word_count = lecture_duration * 130
        # Rough heuristic: ~4/3 tokens per English word.
        max_output_tokens = int(target_word_count * 4 / 3)
        summarized_transcript = self.summarize_text(
            raw_text, max_output_tokens=max_output_tokens
        )
        user_prompt = self.final_response_user_prompt.format(
            duration=lecture_duration, word_count=target_word_count
        )
        full_text = f"{user_prompt}\n\nTranscript:\n{summarized_transcript}"
        final_response = openai.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.final_response_system_prompt},
                {"role": "user", "content": full_text},
            ],
            max_tokens=max_output_tokens,
            temperature=0.7,
        )
        return (
            final_response.choices[0].message.content
            or "Error: No response from the model."
        )
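
# Example usage: a minimal sketch assuming OPENAI_API_KEY is set in .env and
# "transcript.txt" (a hypothetical path) holds the raw transcript text.
if __name__ == "__main__":
    transformer = TranscriptTransformer()
    with open("transcript.txt", encoding="utf-8") as f:
        raw_transcript = f.read()
    # Print the generated ~30-minute lecture in markdown.
    print(transformer.generate_lecture(raw_transcript, lecture_duration=30))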