"""Sabha Scholar — ask questions about the Madras Music Academy Souvenir PDFs.

Pipeline: OCR each PDF in ``pdf_directory`` (pdf2image + pytesseract), then send
the user's query plus a slice of the combined text to OpenAI for extraction,
served through a simple Gradio text interface.
"""

import glob
import os

import openai
import gradio as gr
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

# OpenAI API key comes from the environment (e.g. a Hugging Face Space secret).
openai.api_key = os.getenv('OPENAI_API_KEY')

# Directory where the PDF files are stored.
pdf_directory = '/path_to_pdf_files'  # Change this to your actual dataset path

# Cache of OCR results keyed by PDF path: OCR is by far the slowest step, and
# the PDFs do not change between queries, so run it at most once per file.
_ocr_cache = {}


def pdf_to_text(pdf_path):
    """Convert each page of the PDF at *pdf_path* to an image and OCR it.

    Returns the concatenated text of all pages, one page per chunk separated
    by newlines. Results are memoized in ``_ocr_cache`` so repeated queries
    do not re-run OCR on the same file.
    """
    if pdf_path in _ocr_cache:
        return _ocr_cache[pdf_path]

    images = convert_from_path(pdf_path)
    # Join per-page OCR output; str.join avoids quadratic += concatenation.
    full_text = "\n".join(
        pytesseract.image_to_string(image) for image in images
    ) + "\n"

    _ocr_cache[pdf_path] = full_text
    return full_text


def extract_info(query):
    """Answer *query* against the OCR'd text of every PDF in ``pdf_directory``.

    Sends the query plus the first 2000 characters of the combined corpus to
    OpenAI GPT-3.5 Turbo and returns the model's answer as a stripped string.

    NOTE: only a 2000-character prefix of the corpus fits in the prompt, so
    answers can only draw on the beginning of the combined text.
    """
    all_texts = []
    # OCR (or fetch from cache) every PDF in the dataset directory.
    for pdf_path in glob.glob(os.path.join(pdf_directory, '*.pdf')):
        all_texts.append(pdf_to_text(pdf_path))

    combined_text = "\n".join(all_texts)

    prompt = f"Extract relevant information based on the following query: '{query}' from the Madras Music Academy Souvenir archives: {combined_text[:2000]}"

    # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface; pin
    # ``openai<1.0`` or migrate to ``openai.OpenAI().chat.completions.create``.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an assistant that extracts information from PDF files using OCR."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=300
    )

    answer = response['choices'][0]['message']['content']
    return answer.strip()


def gradio_interface(query):
    """Gradio callback: thin wrapper delegating to :func:`extract_info`."""
    return extract_info(query)


# Define the Gradio interface (kept at module level so hosts that import the
# app object can find it).
iface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs="text",
    title="Sabha Scholar - Madras Music Academy AI Explorer",
    description="Ask questions about the Madras Music Academy Souvenirs."
)

# Guard the launch so importing this module does not start the web server.
if __name__ == "__main__":
    iface.launch()