Spaces:
Configuration error
Configuration error
| from unstructured.partition.pdf import partition_pdf | |
| from langchain_openai import ChatOpenAI | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_core.output_parsers import StrOutputParser | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
def get_images_base64(chunks):
    """Collect the base64 image payloads embedded in composite chunks.

    Only chunks whose type string contains ``CompositeElement`` are
    inspected. Within each of those, every original element whose type
    string contains ``Image`` contributes its ``metadata.image_base64``.

    Args:
        chunks: Iterable of chunk objects that expose a ``metadata`` attribute.

    Returns:
        list: The ``image_base64`` values, in the order they were encountered.
    """
    images_b64 = []
    for chunk in chunks:
        if "CompositeElement" not in str(type(chunk)):
            continue
        # A composite chunk may carry no original elements (attribute absent
        # or set to None); treat that as "nothing to scan" instead of crashing.
        orig_elements = getattr(chunk.metadata, "orig_elements", None) or []
        images_b64.extend(
            el.metadata.image_base64
            for el in orig_elements
            if "Image" in str(type(el))
        )
    return images_b64
def LoadAndExtractData(file_path):
    """Partition a PDF and split the result into tables, text chunks and images.

    Args:
        file_path: Path of the PDF handed to ``partition_pdf``.

    Returns:
        tuple: ``(tables, texts, images)`` on success, where ``tables`` holds
        the chunks whose type string contains ``Table``, ``texts`` those whose
        type string contains ``CompositeElement``, and ``images`` is whatever
        ``get_images_base64`` returns for the full chunk list.
        If anything raises, the error is printed and ``([], [], str(error))``
        is returned, so the third item is then a message string, not a list.
    """
    try:
        print(">> Extracting Data")
        table_chunks, text_chunks = [], []
        # NOTE(review): the original author left `strategy="hi_res"` disabled
        # with a remark that it is required for table inference; confirm
        # whether tables are actually produced without it.
        elements = partition_pdf(
            filename=file_path,
            infer_table_structure=True,
            extract_image_block_types=["Image"],
            # Request image payloads inline; get_images_base64 reads them
            # back from each element's metadata.
            extract_image_block_to_payload=True,
            chunking_strategy="by_title",
            max_characters=10000,
            combine_text_under_n_chars=2000,
            new_after_n_chars=6000,
        )

        print(">> Extracting Text and tables...")
        for element in elements:
            type_name = str(type(element))
            # The two checks are independent: one chunk may land in both lists.
            if "Table" in type_name:
                table_chunks.append(element)
            if "CompositeElement" in type_name:
                text_chunks.append(element)
        print(">> Chunks are: ", elements)

        print(">> Extracting Images...")
        return table_chunks, text_chunks, get_images_base64(elements)
    except Exception as err:
        message = str(err)
        print("Error is: ", message)
        return [], [], message
| # Summarizer Function | |
def Summarizer(prompt_template, data, config=True, set_messages=False):
    """Summarize a batch of inputs with ``gpt-4o-mini`` via a prompt chain.

    Args:
        prompt_template (str): Prompt text. In text mode it is the template
            itself and each input is supplied under the ``element`` key. In
            image mode it is the text part of a single user message and each
            input is supplied under the ``image`` key, which is substituted
            into a ``data:image/jpeg;base64,...`` URL.
        data: Batch of inputs handed to the chain's ``batch`` call.
        config (bool): When true, ``batch`` is called with a
            ``max_concurrency`` of 3; otherwise it is called without a config.
        set_messages (bool): Selects image mode when true, text mode otherwise.

    Returns:
        Whatever ``batch`` returns (one parsed string output per input is
        the intent). If any step raises, ``str(error)`` is returned instead,
        so on failure the caller receives a single string rather than a list.
    """
    try:
        if set_messages:
            input_key = "image"
            user_content = [
                {"type": "text", "text": prompt_template},
                {
                    "type": "image_url",
                    "image_url": {"url": "data:image/jpeg;base64,{image}"},
                },
            ]
            prompt = ChatPromptTemplate.from_messages([("user", user_content)])
        else:
            input_key = "element"
            prompt = ChatPromptTemplate.from_template(prompt_template)

        # Same model settings for both modes; only the prompt and the name of
        # the input variable differ.
        model = ChatOpenAI(temperature=0.5, model="gpt-4o-mini")
        chain = {input_key: lambda x: x} | prompt | model | StrOutputParser()

        if not config:
            return chain.batch(data)
        return chain.batch(data, {"max_concurrency": 3})
    except Exception as err:
        return str(err)