"""Generate question-answer pairs from a text document with the OpenAI chat
API, exposed through a small Gradio UI.

The uploaded document is split into fixed-size character chunks; for each
chunk the model is asked for N Q/A pairs, which are parsed into SQuAD-like
records and saved to ``qa_pairs.json`` (plus a ``total_pairs.json`` summary).
"""

import json
import os
import time

import gradio as gr
from openai import OpenAI

# Get the API key from environment variable.  The conventional name is
# OPENAI_API_KEY; fall back to the original (likely misspelled) OPEN_API_KEY
# so existing deployments keep working.
api_key = os.environ.get('OPENAI_API_KEY') or os.environ.get('OPEN_API_KEY')

# Initialize the OpenAI client once at module level.
client = OpenAI(api_key=api_key)

CHUNK_SIZE = 500          # characters per document chunk
MAX_RETRIES = 3           # API attempts per chunk before giving up
RETRY_DELAY_SECONDS = 20  # back-off between failed API calls


def generate_qa_pairs(chunk, num_pairs=2):
    """Ask the chat model for *num_pairs* question-answer pairs about *chunk*.

    Returns the raw model text in ``Q: ... A: ...`` format, or an empty
    string if every retry fails (the parser then simply yields no pairs).
    """
    prompt = f"""Given the following text, generate {num_pairs} question-answer pairs. Ensure the questions and answers capture key ideas from the text.

Text: {chunk}

Format each pair as:
Q: [Question]
A: [Answer]
"""
    # Bounded retry loop: the original retried forever via recursion, which
    # eventually hits RecursionError on a persistent outage.
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Given the following text, generate question-answer pairs."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.8,
                max_tokens=500,
            )
            return response.choices[0].message.content
        except Exception as e:  # rate limits / network errors: back off, retry
            print(f"An error occurred: {e}")
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY_SECONDS)
    return ""  # all retries exhausted


def process_qa_pairs(qa_text, chunk, id_prefix="dod5000"):
    """Parse the model's ``Q: ... A: ...`` text into SQuAD-style records.

    Each record carries the originating *chunk* as its context.  Pairs
    missing an ``A: `` marker are skipped instead of crashing the run.
    """
    pairs = []
    for i, qa in enumerate(qa_text.split('Q: ')[1:]):  # [0] is the preamble
        if 'A: ' not in qa:
            continue  # malformed pair: question without an answer
        # maxsplit=1 keeps any literal "A: " inside the answer text intact.
        q, a = qa.split('A: ', 1)
        pairs.append({
            "id": f"{id_prefix}-{i:03d}",
            "question": q.strip(),
            "context": chunk,
            # answer_start is not directly available from the OpenAI response
            "answers": [{"text": a.strip(), "answer_start": None}],
        })
    return pairs


def process_document(file, num_pairs):
    """Chunk the uploaded *file* and generate Q/A pairs for every chunk.

    Returns the accumulated list of pair records, or an empty list when the
    document cannot be read or processing fails.
    """
    try:
        with open(file.name, 'r', encoding='utf-8') as f:
            text = f.read()
        chunks = [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
        qa_pairs = []
        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1} of {len(chunks)}")
            raw = generate_qa_pairs(chunk, num_pairs)
            qa_pairs.extend(process_qa_pairs(raw, chunk, id_prefix=f"dod{i+1}000"))
            time.sleep(3)  # To avoid hitting rate limits
        return qa_pairs
    except Exception as e:
        print(f"An error occurred: {e}")
        return []


def save_to_json(qa_pairs):
    """Write the pairs plus a count summary to JSON files in the CWD."""
    with open('qa_pairs.json', 'w') as file:
        json.dump(qa_pairs, file, indent=4)
    with open('total_pairs.json', 'w') as file:
        json.dump({"total_pairs": len(qa_pairs)}, file, indent=4)


def main(file, num_pairs):
    """Gradio click callback: process the document and report the count."""
    # gr.Number delivers a float; cast so the prompt reads "2", not "2.0".
    qa_pairs = process_document(file, int(num_pairs))
    save_to_json(qa_pairs)
    return f"Total number of Q&A pairs generated: {len(qa_pairs)}"


with gr.Blocks() as demo:
    file_input = gr.File(label="Upload Document")
    num_pairs_input = gr.Number(label="Number of Pairs per Chunk", value=2)
    output_label = gr.Label(label="Output")
    gr.Button("Generate QA Pairs").click(
        main,
        inputs=[file_input, num_pairs_input],
        outputs=output_label,
    )

if __name__ == "__main__":
    # Guarded so importing this module does not start the web server.
    demo.launch()