# sabhascholar / app.py
# (Hugging Face Space page residue preserved as comments: "dindizz's picture",
#  "Update app.py", commit 4bb5912 verified)
import openai
import gradio as gr
import os
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import glob
# Access the OpenAI API key from environment variables (Hugging Face secret).
# NOTE(review): os.getenv returns None when the secret is unset — API calls
# would then fail at request time, not here.
openai.api_key = os.getenv('OPENAI_API_KEY')
# Directory where the PDF files are stored.
# NOTE(review): '/path_to_pdf_files' is a placeholder — must be changed to the
# actual dataset path for the app to find any PDFs.
pdf_directory = '/path_to_pdf_files' # Change this to your actual dataset path
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF via OCR.

    Each page is rasterized to an image with pdf2image, then run through
    Tesseract OCR via pytesseract.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        str: The concatenated OCR text of all pages, with a newline
        appended after each page's text (matching the original output
        format exactly).
    """
    pages = convert_from_path(pdf_path)
    # Build the result with a single join instead of repeated += in a loop,
    # which is quadratic in the worst case; output is byte-identical.
    return "".join(pytesseract.image_to_string(page) + "\n" for page in pages)
def extract_info(query):
    """Answer *query* against the OCR'd text of all PDFs in pdf_directory.

    OCR is by far the most expensive step, so extracted text is cached per
    PDF path on the function object and reused across calls — the original
    re-OCR'd every PDF on every single query.

    Args:
        query: The user's natural-language question.

    Returns:
        str: The model's answer, stripped of surrounding whitespace.
    """
    # Per-path OCR cache, stored on the function itself so no module-level
    # state is needed. New or previously unseen PDFs are OCR'd on demand.
    cache = extract_info.__dict__.setdefault('_text_cache', {})
    all_texts = []
    # sorted() makes the prompt deterministic across runs — raw glob order
    # is filesystem-dependent.
    for pdf_path in sorted(glob.glob(f'{pdf_directory}/*.pdf')):
        if pdf_path not in cache:
            cache[pdf_path] = pdf_to_text(pdf_path)
        all_texts.append(cache[pdf_path])
    combined_text = "\n".join(all_texts)
    # Truncate to 2000 chars to keep the prompt within the model context.
    prompt = (
        f"Extract relevant information based on the following query: '{query}' "
        f"from the Madras Music Academy Souvenir archives: {combined_text[:2000]}"
    )
    # NOTE(review): openai.ChatCompletion is the legacy (<1.0) SDK API —
    # confirm the pinned openai package version still supports it.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an assistant that extracts information from PDF files using OCR."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=300,
    )
    # Legacy responses support dict-style indexing.
    answer = response['choices'][0]['message']['content']
    return answer.strip()
# Define the Gradio interface
def gradio_interface(query):
    """Gradio callback: forward the user's question to extract_info."""
    answer = extract_info(query)
    return answer
# Build and launch the Gradio app: a single text box in, a single text box
# out, wired to gradio_interface above.
iface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs="text",
    title="Sabha Scholar - Madras Music Academy AI Explorer",
    description="Ask questions about the Madras Music Academy Souvenirs."
)
# Blocks here serving HTTP until the process is stopped.
iface.launch()