File size: 3,075 Bytes
b4e9e1e
35c3e58
 
 
 
 
6b18659
 
 
35c3e58
d37bb88
35c3e58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
from openai import OpenAI
import time
import gradio as gr
import json

# Read the API key from the environment. The standard variable name is
# OPENAI_API_KEY; the original 'OPEN_API_KEY' is kept as a fallback so
# existing deployments that set the misspelled name keep working.
api_key = os.environ.get('OPENAI_API_KEY') or os.environ.get('OPEN_API_KEY')

# Initialize the OpenAI client (a None key fails later at request time).
client = OpenAI(api_key=api_key)

def generate_qa_pairs(chunk, num_pairs=2):
    """Ask the chat model to produce Q&A pairs for one text chunk.

    Args:
        chunk: The text excerpt to generate questions from.
        num_pairs: How many question-answer pairs to request.

    Returns:
        The raw model output ("Q: ... / A: ..." formatted text), or an
        empty string if every retry fails.
    """
    prompt = f"""Given the following text, generate {num_pairs} question-answer pairs.
    Ensure the questions and answers capture key ideas from the text.

    Text: {chunk}

    Format each pair as:
    Q: [Question]
    A: [Answer]
    """

    # Bounded retry loop. The original retried via unbounded recursion,
    # which on a persistent failure (bad key, no network) would loop
    # forever and eventually hit the recursion limit.
    max_retries = 5
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Given the following text, generate question-answer pairs."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.8,
                max_tokens=500
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"An error occurred: {e}")
            time.sleep(20)  # Wait for 20 seconds before retrying
    # Give up: an empty string parses to zero pairs downstream.
    return ""

def process_qa_pairs(qa_text, chunk, id_prefix="dod5000"):
    """Parse "Q: ... A: ..." model output into SQuAD-style dicts.

    Args:
        qa_text: Raw model output containing "Q: "/"A: " delimited pairs.
        chunk: The source text, stored as each pair's context.
        id_prefix: Prefix for the generated sequential ids.

    Returns:
        A list of dicts with keys id, question, context, answers.
        Malformed pairs (no "A: " marker) are skipped rather than raising.
    """
    pairs = []
    for qa in qa_text.split('Q: ')[1:]:  # Skip the leading empty split
        # partition (not split) tolerates zero or multiple 'A: ' markers:
        # the original split-and-unpack raised ValueError on either case.
        q, sep, a = qa.partition('A: ')
        if not sep:
            continue  # No answer marker — drop this malformed pair.
        pairs.append({
            # len(pairs) keeps ids sequential even when pairs are skipped.
            "id": f"{id_prefix}-{len(pairs):03d}",
            "question": q.strip(),
            "context": chunk,
            "answers": [{"text": a.strip(), "answer_start": None}]  # Note: answer_start is not directly available from the OpenAI response
        })
    return pairs

def process_document(file, num_pairs):
    """Chunk an uploaded text file and generate Q&A pairs per chunk.

    Args:
        file: An uploaded-file object exposing a `.name` path (Gradio File).
        num_pairs: Pairs to request per chunk; coerced to int because
            gr.Number delivers floats (otherwise the prompt asks for
            e.g. "2.0 question-answer pairs").

    Returns:
        The combined list of parsed Q&A pair dicts, or [] on any error.
    """
    try:
        num_pairs = int(num_pairs)
        # Explicit encoding so reads don't depend on the platform default.
        with open(file.name, 'r', encoding='utf-8') as f:
            text = f.read()
        chunks = [text[i:i+500] for i in range(0, len(text), 500)]  # Assuming chunks of 500 characters
        qa_pairs = []
        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1} of {len(chunks)}")
            pairs = generate_qa_pairs(chunk, num_pairs)
            qa_pairs.extend(process_qa_pairs(pairs, chunk, id_prefix=f"dod{i+1}000"))
            time.sleep(3)  # To avoid hitting rate limits
        return qa_pairs
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

def save_to_json(qa_pairs):
    """Persist the pairs and a count summary as JSON in the working dir.

    Writes qa_pairs.json (the full list) and total_pairs.json (just the
    count). Explicit UTF-8 encoding avoids platform-default surprises.
    """
    with open('qa_pairs.json', 'w', encoding='utf-8') as file:
        json.dump(qa_pairs, file, indent=4)
    with open('total_pairs.json', 'w', encoding='utf-8') as file:
        json.dump({"total_pairs": len(qa_pairs)}, file, indent=4)

def main(file, num_pairs):
    """Run the full pipeline: generate pairs, save them, report the count."""
    generated = process_document(file, num_pairs)
    save_to_json(generated)
    return f"Total number of Q&A pairs generated: {len(generated)}"

with gr.Blocks() as demo:
    # Inputs: the document to mine and how many pairs to ask for per chunk.
    uploaded_doc = gr.File(label="Upload Document")
    pairs_per_chunk = gr.Number(label="Number of Pairs per Chunk", value=2)
    # Output: a one-line summary of how many pairs were generated.
    summary = gr.Label(label="Output")

    run_button = gr.Button("Generate QA Pairs")
    run_button.click(
        main,
        inputs=[uploaded_doc, pairs_per_chunk],
        outputs=summary,
    )

demo.launch()