dindizz committed on
Commit
96a45e3
·
verified ·
1 Parent(s): e63b931

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -17
app.py CHANGED
@@ -1,41 +1,59 @@
1
  import openai
2
  import gradio as gr
3
- import pandas as pd
4
- import os # Importing os to access environment variables
5
  from datasets import load_dataset
6
-
7
- # Load the dataset from Hugging Face
8
- dataset = load_dataset('https://huggingface.co/spaces/dindizz/musicacademyarchives')
9
 
10
  # Access the OpenAI API key from environment variables (Hugging Face secret)
11
  openai.api_key = os.getenv('OPENAI_API_KEY')
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def extract_info(query):
14
  """
15
  This function interacts with OpenAI GPT-3.5 Turbo to extract information from the dataset based on the user's query.
16
  """
17
- # Extracting the text content from the dataset to pass as context
18
- all_souvenirs = []
 
19
  for item in dataset['train']:
20
- souvenir_text = item['text'] # Assuming the column name is 'text' containing the content
21
- all_souvenirs.append(souvenir_text)
 
22
 
23
- # Combine the content into a single string (you can adjust based on the size of the dataset)
24
- combined_souvenir_text = "\n".join(all_souvenirs)
25
 
26
- # Prompt OpenAI GPT-3.5 with the user's query and the combined text
27
- prompt = f"Extract relevant information based on the following query: '{query}' from the Madras Music Academy Souvenir archives: {combined_souvenir_text[:2000]}" # limiting the length for performance
28
 
29
  response = openai.ChatCompletion.create(
30
- model="gpt-3.5-turbo", # Updated model
31
  messages=[
32
- {"role": "system", "content": "You are an assistant that extracts information from the Madras Music Academy Souvenir dataset and present in a friendly tone ."},
33
  {"role": "user", "content": prompt}
34
  ],
35
  max_tokens=300
36
  )
37
 
38
- # Returning the answer from OpenAI GPT-3.5 Turbo
39
  answer = response['choices'][0]['message']['content']
40
  return answer.strip()
41
 
@@ -49,7 +67,7 @@ iface = gr.Interface(
49
  inputs="text",
50
  outputs="text",
51
  title="Sabha Scholar - Madras Music Academy AI Explorer",
52
- description="Ask questions about the Madras Music Academy Souvenirs and extract information using OpenAI GPT-3.5 Turbo."
53
  )
54
 
55
  iface.launch()
 
1
  import openai
2
  import gradio as gr
3
+ import os
 
4
  from datasets import load_dataset
5
+ from pdf2image import convert_from_path
6
+ import pytesseract
7
+ from PIL import Image
8
 
9
  # Access the OpenAI API key from environment variables (Hugging Face secret)
10
  openai.api_key = os.getenv('OPENAI_API_KEY')
11
 
12
+ # Function to convert PDF to images and apply OCR
13
def pdf_to_text(pdf_path):
    """
    Render each page of a PDF as an image and OCR it into text.

    Args:
        pdf_path: Location of the PDF, handed unchanged to convert_from_path.

    Returns:
        The OCR output of every page in page order, each followed by a
        newline; an empty string when no page images are produced.
    """
    page_images = convert_from_path(pdf_path)

    # One join over the per-page OCR results instead of growing a string with +=.
    return "".join(
        pytesseract.image_to_string(page_image) + "\n"
        for page_image in page_images
    )
26
+
27
+ # Load the dataset from Hugging Face (adjust to point to your dataset)
28
+ dataset = load_dataset('dindizz/musicacademyarchives')
29
+
30
  def extract_info(query):
31
  """
32
  This function interacts with OpenAI GPT-3.5 Turbo to extract information from the dataset based on the user's query.
33
  """
34
+ all_texts = []
35
+
36
+ # Loop through the PDF files in the dataset
37
  for item in dataset['train']:
38
+ pdf_path = item['file'] # Adjust based on the dataset structure
39
+ pdf_text = pdf_to_text(pdf_path)
40
+ all_texts.append(pdf_text)
41
 
42
+ combined_text = "\n".join(all_texts)
 
43
 
44
+ # Send combined text and query to OpenAI for extraction
45
+ prompt = f"Extract relevant information based on the following query: '{query}' from the Madras Music Academy Souvenir archives: {combined_text[:2000]}"
46
 
47
  response = openai.ChatCompletion.create(
48
+ model="gpt-3.5-turbo",
49
  messages=[
50
+ {"role": "system", "content": "You are an assistant that extracts information from PDF files using OCR."},
51
  {"role": "user", "content": prompt}
52
  ],
53
  max_tokens=300
54
  )
55
 
56
+ # Return the answer from OpenAI GPT-3.5
57
  answer = response['choices'][0]['message']['content']
58
  return answer.strip()
59
 
 
67
  inputs="text",
68
  outputs="text",
69
  title="Sabha Scholar - Madras Music Academy AI Explorer",
70
+ description="Ask questions about the Madras Music Academy Souvenirs. Extract information using OCR and OpenAI GPT-3.5 Turbo."
71
  )
72
 
73
  iface.launch()