Spaces:

Multimedika
/

Bot_Development

Runtime error

App Files Files Community

Bot_Development / core /summarization /summarizer.py

dsmultimedika

Improve the code bot development

d57efd6 over 1 year ago

raw

history blame contribute delete

4.5 kB

	from io import BytesIO
	import os
	import base64
	import fitz

	from fastapi.responses import JSONResponse
	from llama_index.core.vector_stores import (
	MetadataFilter,
	MetadataFilters,
	FilterCondition,
	)

	from llama_index.core import load_index_from_storage
	from llama_index.core.storage import StorageContext
	from llama_index.llms.openai import OpenAI
	from core.parser import parse_topics_to_dict
	from llama_index.core.llms import ChatMessage
	from core.prompt import (
	SYSTEM_TOPIC_TEMPLATE,
	USER_TOPIC_TEMPLATE,
	REFINED_GET_TOPIC_TEMPLATE,
	)

	# from langfuse.openai import openai


	class SummarizeGenerator:
	    """Turn a table-of-contents PDF into a topic outline and build filtered query engines.

	    The PDF pages are rendered to images, sent to the chat LLM together with
	    the topic prompts, and the answer is refined and parsed into a dict.
	    """

	    # Image format used when rendering pages. The data URL sent to the LLM
	    # is derived from the same value so the declared MIME type always
	    # matches the bytes that are actually encoded.
	    _IMAGE_FORMAT = "png"

	    def __init__(self, references):
	        """Store the references and create the LLM client.

	        Args:
	            references: Kept on ``self.references``; no method of this class
	                reads it yet.
	        """
	        self.references = references
	        self.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=4096)

	    def extract_pages(self, content_table):
	        """Render every page of an uploaded PDF as a base64-encoded image.

	        Args:
	            content_table: Upload object exposing a readable binary ``.file``
	                stream (presumably a FastAPI ``UploadFile`` - confirm with
	                the caller).

	        Returns:
	            A list with one base64 string per page that rendered
	            successfully, or a ``JSONResponse`` with status 400 when the PDF
	            cannot be opened, or status 404 when no page could be rendered.
	        """
	        try:
	            content_bytes = content_table.file.read()
	            # Open the PDF from memory; the raw bytes are intentionally not
	            # printed because that would dump the whole file to stdout.
	            document = fitz.open(stream=content_bytes, filetype="pdf")
	            print(document)
	        except Exception as e:
	            return JSONResponse(status_code=400, content=f"Error opening PDF file: {e}")

	        # Collect the base64 encoded image of each page
	        pix_encoded_combined = []

	        try:
	            for page_number in range(len(document)):
	                try:
	                    page = document.load_page(page_number)
	                    pix_encoded_combined.append(self._extract_image_as_base64(page))
	                except Exception as e:
	                    # Best effort: a page that fails to load or render is
	                    # skipped so the remaining pages are still processed.
	                    print(f"Error processing page {page_number}: {e}")
	        finally:
	            # Release the document even if something unexpected happens.
	            document.close()

	        if not pix_encoded_combined:
	            return JSONResponse(status_code=404, content="No images found in the PDF")

	        return pix_encoded_combined

	    def extract_content_table(self, content_table):
	        """Ask the LLM for the topics shown in a table-of-contents PDF.

	        Args:
	            content_table: Upload object accepted by ``extract_pages``.

	        Returns:
	            A tuple ``(refined_text, topics)`` where ``refined_text`` is the
	            refined LLM answer as a string and ``topics`` is the dict built
	            from ``parse_topics_to_dict``. On failure a ``JSONResponse`` is
	            returned instead: the 400/404 response from ``extract_pages`` is
	            passed through unchanged, any other error becomes status 500.
	        """
	        try:
	            images = self.extract_pages(content_table)
	            if isinstance(images, JSONResponse):
	                # Page extraction already produced a precise error response;
	                # iterating over it would fail and hide it behind a 500.
	                return images

	            image_messages = [
	                {
	                    "type": "image_url",
	                    "image_url": {
	                        "url": f"data:image/{self._IMAGE_FORMAT};base64,{image}",
	                    },
	                }
	                for image in images
	            ]

	            messages = [
	                ChatMessage(
	                    role="system",
	                    content=[{"type": "text", "text": SYSTEM_TOPIC_TEMPLATE}],
	                ),
	                ChatMessage(
	                    role="user",
	                    content=[
	                        {"type": "text", "text": USER_TOPIC_TEMPLATE},
	                        *image_messages,
	                    ],
	                ),
	            ]

	            extractor_output = self.llm.chat(messages)
	            print("extractor output : ", extractor_output)

	            # Second pass: have the LLM clean up the first answer.
	            refined_extractor_output = self.llm.complete(
	                REFINED_GET_TOPIC_TEMPLATE.format(topics=str(extractor_output))
	            )
	            refined_text = str(refined_extractor_output)
	            print("refined extractor output : ", refined_text)

	            extractor_dics = dict(parse_topics_to_dict(refined_text))

	            return refined_text, extractor_dics

	        except Exception as e:
	            return JSONResponse(status_code=500, content=f"An error occurred: {e}")

	    def _extract_image_as_base64(self, page) -> str:
	        """Render one PDF page and return the image as a base64 string.

	        Args:
	            page: Page object providing ``get_pixmap()``.

	        Returns:
	            The rendered image bytes (format ``_IMAGE_FORMAT``), base64
	            encoded and decoded to a UTF-8 string.

	        Raises:
	            Exception: Whatever the rendering or encoding raises. Errors are
	                deliberately not converted into a response object here, so
	                the caller can skip the page instead of mistaking an error
	                object for image data.
	        """
	        pix = page.get_pixmap()
	        pix_bytes = pix.tobytes(self._IMAGE_FORMAT)
	        return base64.b64encode(pix_bytes).decode("utf-8")

	    def index_summarizer_engine(self, topic, subtopic, index):
	        """Build a query engine over ``index`` restricted to one topic and subtopic.

	        Args:
	            topic: Value the ``title`` metadata field must match.
	            subtopic: Value the ``category`` metadata field must match.
	            index: Index object providing ``as_query_engine``.

	        Returns:
	            The query engine returned by ``index.as_query_engine``, configured
	            with ``similarity_top_k=5`` and both metadata filters combined
	            with AND.
	        """
	        filters = MetadataFilters(
	            filters=[
	                MetadataFilter(key="title", value=topic),
	                MetadataFilter(key="category", value=subtopic),
	            ],
	            condition=FilterCondition.AND,
	        )

	        return index.as_query_engine(similarity_top_k=5, filters=filters)

	    def get_summarizer_engine(self, topic, subtopic):
	        """Placeholder; not implemented yet (returns ``None``)."""

	    def prepare_summaries(self):
	        """Placeholder; not implemented yet (returns ``None``)."""