Spaces:
Sleeping
Sleeping
| import openai | |
| import gradio as gr | |
| import os | |
| from pdf2image import convert_from_path | |
| import pytesseract | |
| from PIL import Image | |
| import glob | |
# The OpenAI API key is read from the environment (configured as a Hugging Face secret);
# the value is None when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Folder scanned for the PDF files -- replace with the actual dataset path
pdf_directory = "/path_to_pdf_files"
def pdf_to_text(pdf_path):
    """
    Render each page of the PDF at ``pdf_path`` to an image and OCR it.

    Returns the recognised text of all pages as a single string, with a
    newline appended after every page's text.
    """
    page_images = convert_from_path(pdf_path)
    # One OCR pass per rendered page; each page's text is terminated by "\n".
    page_texts = [pytesseract.image_to_string(page) + "\n" for page in page_images]
    return "".join(page_texts)
def extract_info(query, *, max_context_chars=2000):
    """
    Answer ``query`` with OpenAI GPT-3.5 Turbo, using OCR text from the PDF
    files in ``pdf_directory`` as context.

    The PDFs are processed in sorted path order so the same files always
    supply the context, and OCR stops as soon as enough text has been
    gathered to fill the context budget (anything beyond it never reaches
    the prompt).

    Args:
        query: The user's question.
        max_context_chars: Maximum number of characters of OCR text that is
            embedded in the prompt. Negative values are treated as 0.

    Returns:
        The model's answer with surrounding whitespace stripped, or an
        explanatory message when ``pdf_directory`` contains no PDF files.
    """
    context_budget = max(0, max_context_chars)

    # glob.glob yields matches in arbitrary order; sort them so the truncated
    # context is reproducible from one query to the next.
    pdf_paths = sorted(glob.glob(f'{pdf_directory}/*.pdf'))
    if not pdf_paths:
        # Without any source text the model could only guess, so skip the API call.
        return "No PDF files were found in the archive directory, so there is nothing to search."

    all_texts = []
    gathered_chars = 0
    for pdf_path in pdf_paths:
        # OCR is the expensive step: stop once the budget is already filled,
        # because the remaining text would be cut off by the slice below.
        if gathered_chars >= context_budget:
            break
        pdf_text = pdf_to_text(pdf_path)
        all_texts.append(pdf_text)
        gathered_chars += len(pdf_text)
    combined_text = "\n".join(all_texts)

    # Send the (truncated) archive text and the query to OpenAI for extraction
    prompt = f"Extract relevant information based on the following query: '{query}' from the Madras Music Academy Souvenir archives: {combined_text[:context_budget]}"

    # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface; confirm
    # the openai version pinned for this app still provides it.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an assistant that extracts information from PDF files using OCR."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=300,
    )

    # The answer is the message content of the first returned choice
    answer = response['choices'][0]['message']['content']
    return answer.strip()
# Callback invoked by the Gradio interface for every submitted query
def gradio_interface(query):
    """Pass the text entered in the UI to ``extract_info`` and return its answer."""
    answer = extract_info(query)
    return answer
# Build the Gradio app: a single text box in, a single text box out
iface = gr.Interface(
    title="Sabha Scholar - Madras Music Academy AI Explorer",
    description="Ask questions about the Madras Music Academy Souvenirs.",
    fn=gradio_interface,
    inputs="text",
    outputs="text",
)

# Start serving the app
iface.launch()