ojas121 committed on
Commit
0913650
·
verified ·
1 Parent(s): e33f9ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -44
app.py CHANGED
@@ -1,15 +1,10 @@
1
- # Required Libraries
2
- import re
3
- import streamlit as st
4
- from transformers import pipeline
5
- from sentence_transformers import SentenceTransformer, util
6
-
7
- # Load pre-trained models
8
- qa_pipeline = pipeline("question-answering")
9
- sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
10
-
11
- # Load the Chanakya Script
12
- chanakya_text = """
13
  BOOK I. Concerning Discipline.
14
  The end of Sciences; association with the aged; restraint of
15
  the organs of sense; the creation of ministers; the creation of
@@ -23,35 +18,87 @@ restraint; treatment of a prince kept under restraint; the duties of a
23
  king; duty towards the harem; personal safety.
24
  """
25
 
26
- # Preprocess Text
27
- def preprocess_text(text):
28
- sentences = re.split(r'[.;]', text) # Split into sentences
29
- return [sentence.strip() for sentence in sentences if sentence.strip()]
30
-
31
- # Embed the sentences for similarity matching
32
- def get_embeddings(sentences):
33
- return sentence_model.encode(sentences, convert_to_tensor=True)
34
-
35
- # Match Query with Closest Text
36
- def get_closest_text(query, sentences, embeddings):
37
- query_embedding = sentence_model.encode(query, convert_to_tensor=True)
38
- scores = util.pytorch_cos_sim(query_embedding, embeddings)
39
- closest_idx = scores.argmax().item()
40
- return sentences[closest_idx]
41
-
42
- # Preprocess and embed the script
43
- sentences = preprocess_text(chanakya_text)
44
- embeddings = get_embeddings(sentences)
45
-
46
- # Streamlit App
47
- st.title("Chanakya GPT")
48
- st.write("Ask questions about Chanakya's teachings!")
49
-
50
- user_query = st.text_input("Enter your question:")
51
- if user_query:
52
- closest_sentence = get_closest_text(user_query, sentences, embeddings)
53
- st.write(f"**Chanakya Says:** {closest_sentence}")
54
-
55
- # Using Hugging Face QA model to refine the answer
56
- answer = qa_pipeline(question=user_query, context=closest_sentence)
57
- st.write(f"**Refined Answer:** {answer['answer']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
4
+ from datasets import Dataset
5
+
6
+ # 1. Training Data Preparation
7
+ raw_text = """
 
 
 
 
 
8
  BOOK I. Concerning Discipline.
9
  The end of Sciences; association with the aged; restraint of
10
  the organs of sense; the creation of ministers; the creation of
 
18
  king; duty towards the harem; personal safety.
19
  """
20
 
21
# Create synthetic QA dataset
def generate_qa_pairs(context):
    """Build a small SQuAD-style QA dataset from *context*.

    Returns a list of records, each holding the full context, one question,
    and an ``answers`` dict with parallel ``text`` / ``answer_start`` lists
    (the Hugging Face extractive-QA format).

    Bug fix: the original used a case-sensitive ``context.find(answer)``;
    answers such as "The aged" occur lower-cased in the text, so ``find``
    returned -1 — an invalid character offset for training labels. We now
    fall back to a case-insensitive search and keep the casing that actually
    occurs in the context. ``answer_start`` is -1 only when the answer text
    truly does not occur in *context*.
    """
    questions_answers = [
        {"question": "What is the end of Sciences?", "answer": "Concerning Discipline"},
        {"question": "Who should one associate with?", "answer": "The aged"},
        {"question": "What does the institution of spies involve?", "answer": "Protection of parties for or against one's own cause in one's own state"},
        {"question": "What are the duties of a king?", "answer": "Duty towards the harem; personal safety"},
    ]
    lowered = context.lower()
    qa_data = []
    for qa in questions_answers:
        answer = qa["answer"]
        start = context.find(answer)
        if start == -1:
            # Case-insensitive fallback; slice the context so the stored
            # answer text matches the span at answer_start byte-for-byte.
            start = lowered.find(answer.lower())
            if start != -1:
                answer = context[start:start + len(answer)]
        qa_data.append({
            "context": context,
            "question": qa["question"],
            "answers": {"text": [answer], "answer_start": [start]},
        })
    return qa_data
31
+
32
qa_data = generate_qa_pairs(raw_text)

# Persist the synthetic dataset as JSON so a training run is reproducible.
os.makedirs("data", exist_ok=True)
with open("data/train.json", "w") as f:
    json.dump(qa_data, f)

# Convert the row-oriented records into the column-oriented mapping that
# Dataset.from_dict expects ({"context": [...], "question": [...], ...}).
columns = {
    key: [record[key] for record in qa_data]
    for key in ("context", "question", "answers")
}
dataset = Dataset.from_dict(columns)
43
+
44
# 2. Load Pretrained Model and Tokenizer
# DistilBERT (cased) with an extractive-QA head; tokenizer and model must
# come from the same checkpoint so their vocabularies agree.
model_name = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
48
+
49
# 3. Tokenize Dataset
def preprocess_data(examples):
    """Tokenize question/context pairs and label answer-span token positions.

    Bug fixes vs. the original:
    * ``return_offsets_mapping=True`` is now requested — the original popped
      ``offset_mapping`` without asking the tokenizer for it (KeyError).
    * Start/end token indices are located by scanning the character-offset
      mapping; the original used ``sequence_ids.index(1, ...)``, which always
      labelled the first context token(s) regardless of the answer position.

    Samples whose answer is absent (``answer_start`` of -1) or truncated out
    of the window are labelled (0, 0), the conventional "no answer" position.
    """
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True,
    )
    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # sequence_ids marks which tokens belong to the context (id == 1).
        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1

        if (start_char < 0
                or start_char < offsets[context_start][0]
                or end_char > offsets[context_end][1]):
            # Answer missing or not fully inside the tokenized window.
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Walk forward to the last token starting at/before start_char.
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
            # Walk backward to the first token ending at/after end_char.
            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs
77
+
78
# Tokenize every example; batched=True hands preprocess_data dicts of lists.
tokenized_dataset = dataset.map(preprocess_data, batched=True)
79
+
80
# 4. Training Arguments
# NOTE(review): no evaluation split exists and no eval_dataset is passed to
# the Trainer, so the original evaluation_strategy="epoch" made the Trainer
# attempt an evaluation at the end of every epoch and crash with a
# ValueError. Evaluation is therefore left at its default ("no").
training_args = TrainingArguments(
    output_dir="./qa_model",          # checkpoints and final model land here
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",            # checkpoint once per epoch
)
90
+
91
# 5. Trainer Initialization — wires model, hyper-parameters, and data together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,  # saved with checkpoints so they reload standalone
)
98
+
99
# 6. Train the Model — runs the fine-tuning loop configured above.
trainer.train()

# Save the fine-tuned model and tokenizer into one directory so it can be
# reloaded later with from_pretrained("./qa_model").
model.save_pretrained("./qa_model")
tokenizer.save_pretrained("./qa_model")