Spaces:
Sleeping
Sleeping
File size: 2,153 Bytes
a90a149 96a45e3 9668d23 a90a149 9668d23 96a45e3 a90a149 96a45e3 9668d23 96a45e3 a90a149 96a45e3 a90a149 96a45e3 a90a149 96a45e3 a90a149 96a45e3 a90a149 96a45e3 a90a149 4bb5912 a90a149 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import openai
import gradio as gr
import os
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import glob
# The OpenAI API key comes from the OPENAI_API_KEY environment variable
# (configured as a Hugging Face secret); it is None when the variable is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY')
# Folder that is scanned for *.pdf files. Placeholder value: point this at the
# real dataset location before deploying.
pdf_directory = '/path_to_pdf_files'
def pdf_to_text(pdf_path):
    """
    Run OCR over a PDF and return the recognised text as one string.

    The file at ``pdf_path`` is rendered to images with ``convert_from_path``
    and each image is passed to ``pytesseract.image_to_string``. The results
    are concatenated in the order the images were returned, with a newline
    appended after each one.
    """
    page_images = convert_from_path(pdf_path)
    # One OCR call per rendered image; every result is newline-terminated.
    page_texts = [pytesseract.image_to_string(page) + "\n" for page in page_images]
    return "".join(page_texts)
def extract_info(query, max_context_chars=2000):
    """
    Answer a user query from the OCR'd PDF archive using OpenAI GPT-3.5 Turbo.

    PDF files matching ``{pdf_directory}/*.pdf`` are OCR'd with ``pdf_to_text``
    in sorted path order. The first ``max_context_chars`` characters of their
    newline-joined text are embedded in the prompt together with the query.

    Args:
        query: The user's question as free text.
        max_context_chars: Upper bound (non-negative) on how many characters
            of archive text are placed in the prompt. Defaults to 2000.

    Returns:
        The model's reply with surrounding whitespace stripped. When the
        query is blank, or when no text could be extracted from the archive,
        a short explanatory message is returned instead and no API call is
        made.
    """
    # A blank query needs neither OCR nor a (billed) API request.
    if not query or not query.strip():
        return "Please enter a question."

    extracted = []
    # Length of "\n".join(extracted); starts at -1 so that the first document
    # does not count a separator in front of it.
    joined_len = -1
    # glob returns paths in arbitrary order; sorting keeps the prompt context
    # the same from one run to the next.
    for pdf_path in sorted(glob.glob(f'{pdf_directory}/*.pdf')):
        # Only the first max_context_chars characters are ever used, so stop
        # OCR-ing further files once that window is already filled.
        if joined_len >= max_context_chars:
            break
        pdf_text = pdf_to_text(pdf_path)
        extracted.append(pdf_text)
        joined_len += len(pdf_text) + 1

    context = "\n".join(extracted)[:max_context_chars]
    if not context.strip():
        # No PDFs matched or OCR produced nothing: asking the model to extract
        # from an empty archive would only invite a made-up answer.
        return "No readable text could be extracted from the PDF archive."

    # Send the truncated archive text and the query to OpenAI for extraction.
    prompt = f"Extract relevant information based on the following query: '{query}' from the Madras Music Academy Souvenir archives: {context}"
    # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface; confirm
    # the deployed openai package version still provides it.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an assistant that extracts information from PDF files using OCR."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=300,
    )
    # The reply text is the message content of the first returned choice.
    answer = response['choices'][0]['message']['content']
    return answer.strip()
# Callback wired into the Gradio UI
def gradio_interface(query):
    """Pass the submitted text to ``extract_info`` and return its answer."""
    answer = extract_info(query)
    return answer
# Build the text-in / text-out Gradio UI around gradio_interface, then start it.
iface = gr.Interface(
    title="Sabha Scholar - Madras Music Academy AI Explorer",
    description="Ask questions about the Madras Music Academy Souvenirs.",
    fn=gradio_interface,
    inputs="text",
    outputs="text",
)
# Starts serving the app as soon as this module is executed.
iface.launch()
|