Spaces:
Runtime error
Runtime error
| from io import BytesIO | |
| import os | |
| import base64 | |
| import fitz | |
| from fastapi.responses import JSONResponse | |
| from llama_index.core.vector_stores import ( | |
| MetadataFilter, | |
| MetadataFilters, | |
| FilterCondition, | |
| ) | |
| from llama_index.core import load_index_from_storage | |
| from llama_index.core.storage import StorageContext | |
| from llama_index.llms.openai import OpenAI | |
| from core.parser import parse_topics_to_dict | |
| from llama_index.core.llms import ChatMessage | |
| from core.prompt import ( | |
| SYSTEM_TOPIC_TEMPLATE, | |
| USER_TOPIC_TEMPLATE, | |
| REFINED_GET_TOPIC_TEMPLATE, | |
| ) | |
| # from langfuse.openai import openai | |
class SummarizeGenerator:
    """Turn a table-of-contents PDF into a topic outline and build filtered query engines.

    The PDF pages are rendered to images, sent to an OpenAI chat model to
    extract the topics, and the model's answer is refined and parsed into a
    dictionary. Failures are reported by *returning* a ``JSONResponse``
    rather than raising, so callers must check the returned type.
    """

    # PyMuPDF renders pages as PNG below, so the data URL must say PNG too.
    _PAGE_IMAGE_MIME = "image/png"

    def __init__(self, references):
        """Store *references* and create the LLM client.

        Args:
            references: Kept on the instance as ``self.references``; not used
                by any method visible in this class.
        """
        self.references = references
        self.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=4096)

    def extract_pages(self, content_table):
        """Render every page of an uploaded PDF to a base64-encoded image.

        Args:
            content_table: Upload object whose ``.file.read()`` yields the raw
                PDF bytes (presumably a FastAPI ``UploadFile`` - confirm
                against the caller).

        Returns:
            A list of base64 strings, one per successfully rendered page.
            A ``JSONResponse`` with status 400 if the PDF cannot be opened, or
            with status 404 if no page could be rendered.
        """
        try:
            pdf_bytes = content_table.file.read()
            document = fitz.open(stream=pdf_bytes, filetype="pdf")
        except Exception as e:
            return JSONResponse(status_code=400, content=f"Error opening PDF file: {e}")

        encoded_pages = []
        try:
            for page_number in range(len(document)):
                try:
                    page = document.load_page(page_number)
                    encoded = self._extract_image_as_base64(page)
                except Exception as e:
                    print(f"Error processing page {page_number}: {e}")
                    continue  # Skip to the next page if there's an error

                # The helper reports failure by returning a JSONResponse
                # instead of raising; such a value must never be treated as
                # image data, so the page is skipped.
                if not isinstance(encoded, str):
                    print(f"Error processing page {page_number}: page could not be rendered")
                    continue

                encoded_pages.append(encoded)
        finally:
            # Release the document even if iteration fails unexpectedly.
            document.close()

        if not encoded_pages:
            return JSONResponse(status_code=404, content="No images found in the PDF")
        return encoded_pages

    def extract_content_table(self, content_table):
        """Extract the topic outline from a table-of-contents PDF using the LLM.

        The page images are sent together with the topic prompts in one chat
        request; the answer is then passed through a second "refine"
        completion and parsed with ``parse_topics_to_dict``.

        Args:
            content_table: Upload object accepted by ``extract_pages``.

        Returns:
            A tuple ``(refined_text, topics_dict)`` on success. A
            ``JSONResponse`` on failure: the 400/404 response produced by
            ``extract_pages`` is passed through unchanged, any other error
            becomes a 500.
        """
        try:
            images = self.extract_pages(content_table)
            # extract_pages signals failure with a JSONResponse; hand it back
            # as-is so its status code is not masked by a generic 500 when the
            # comprehension below tries to iterate it.
            if isinstance(images, JSONResponse):
                return images

            image_messages = [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{self._PAGE_IMAGE_MIME};base64,{image}",
                    },
                }
                for image in images
            ]
            messages = [
                ChatMessage(
                    role="system",
                    content=[{"type": "text", "text": SYSTEM_TOPIC_TEMPLATE}],
                ),
                ChatMessage(
                    role="user",
                    content=[
                        {"type": "text", "text": USER_TOPIC_TEMPLATE},
                        *image_messages,
                    ],
                ),
            ]

            extractor_output = self.llm.chat(messages)
            print("extractor output : ", extractor_output)

            refined_extractor_output = self.llm.complete(
                REFINED_GET_TOPIC_TEMPLATE.format(topics=str(extractor_output))
            )
            refined_text = str(refined_extractor_output)
            print("refined extractor output : ", refined_text)

            extractor_dics = dict(parse_topics_to_dict(refined_text))
            return refined_text, extractor_dics
        except Exception as e:
            return JSONResponse(status_code=500, content=f"An error occurred: {e}")

    def _extract_image_as_base64(self, page):
        """Render *page* to a PNG image and return it base64-encoded.

        Args:
            page: Object exposing ``get_pixmap()`` (a PyMuPDF page).

        Returns:
            The base64 text of the rendered image, or a ``JSONResponse`` with
            status 500 if rendering or encoding fails.
        """
        try:
            pix = page.get_pixmap()
            # Request PNG explicitly so the bytes always match _PAGE_IMAGE_MIME.
            pix_bytes = pix.tobytes(output="png")
            return base64.b64encode(pix_bytes).decode("utf-8")
        except Exception as e:
            return JSONResponse(status_code=500, content=f"Error extracting image: {e}")

    def index_summarizer_engine(self, topic, subtopic, index):
        """Build a query engine over *index* restricted to one topic/subtopic.

        Args:
            topic: Value the ``title`` metadata field must match.
            subtopic: Value the ``category`` metadata field must match.
            index: Object exposing ``as_query_engine(**kwargs)``.

        Returns:
            The query engine returned by ``index.as_query_engine``.
        """
        filters = MetadataFilters(
            filters=[
                MetadataFilter(key="title", value=topic),
                MetadataFilter(key="category", value=subtopic),
            ],
            condition=FilterCondition.AND,
        )
        # Both filters must match (AND); retrieval keeps the 5 most similar nodes.
        kwargs = {"similarity_top_k": 5, "filters": filters}
        query_engine = index.as_query_engine(**kwargs)
        return query_engine

    def get_summarizer_engine(self, topic, subtopic):
        """Not implemented yet; currently returns ``None``."""
        pass

    def prepare_summaries(self):
        """Not implemented yet; currently returns ``None``."""
        pass