ojas121 committed on
Commit
0913650
·
verified ·
1 Parent(s): e33f9ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -44
app.py CHANGED
@@ -1,15 +1,10 @@
1
- # Required Libraries
2
- import re
3
- import streamlit as st
4
- from transformers import pipeline
5
- from sentence_transformers import SentenceTransformer, util
6
-
7
- # Load pre-trained models
8
- qa_pipeline = pipeline("question-answering")
9
- sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
10
-
11
- # Load the Chanakya Script
12
- chanakya_text = """
13
  BOOK I. Concerning Discipline.
14
  The end of Sciences; association with the aged; restraint of
15
  the organs of sense; the creation of ministers; the creation of
@@ -23,35 +18,87 @@ restraint; treatment of a prince kept under restraint; the duties of a
23
  king; duty towards the harem; personal safety.
24
  """
25
 
26
- # Preprocess Text
27
- def preprocess_text(text):
28
- sentences = re.split(r'[.;]', text) # Split into sentences
29
- return [sentence.strip() for sentence in sentences if sentence.strip()]
30
-
31
- # Embed the sentences for similarity matching
32
- def get_embeddings(sentences):
33
- return sentence_model.encode(sentences, convert_to_tensor=True)
34
-
35
- # Match Query with Closest Text
36
- def get_closest_text(query, sentences, embeddings):
37
- query_embedding = sentence_model.encode(query, convert_to_tensor=True)
38
- scores = util.pytorch_cos_sim(query_embedding, embeddings)
39
- closest_idx = scores.argmax().item()
40
- return sentences[closest_idx]
41
-
42
- # Preprocess and embed the script
43
- sentences = preprocess_text(chanakya_text)
44
- embeddings = get_embeddings(sentences)
45
-
46
- # Streamlit App
47
- st.title("Chanakya GPT")
48
- st.write("Ask questions about Chanakya's teachings!")
49
-
50
- user_query = st.text_input("Enter your question:")
51
- if user_query:
52
- closest_sentence = get_closest_text(user_query, sentences, embeddings)
53
- st.write(f"**Chanakya Says:** {closest_sentence}")
54
-
55
- # Using Hugging Face QA model to refine the answer
56
- answer = qa_pipeline(question=user_query, context=closest_sentence)
57
- st.write(f"**Refined Answer:** {answer['answer']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
4
+ from datasets import Dataset
5
+
6
+ # 1. Training Data Preparation
7
+ raw_text = """
 
 
 
 
 
8
  BOOK I. Concerning Discipline.
9
  The end of Sciences; association with the aged; restraint of
10
  the organs of sense; the creation of ministers; the creation of
 
18
  king; duty towards the harem; personal safety.
19
  """
20
 
21
# Create synthetic QA dataset
def generate_qa_pairs(context):
    """Build a small SQuAD-style QA dataset from *context*.

    Returns a list of records, each holding the full context, one question,
    and an ``answers`` dict with parallel ``text`` / ``answer_start`` lists
    (the Hugging Face extractive-QA format).

    Bug fix: the original used a case-sensitive ``context.find(answer)``;
    answers such as "The aged" occur lower-cased in the text, so ``find``
    returned -1 — an invalid character offset for training labels. We now
    fall back to a case-insensitive search and keep the casing that actually
    occurs in the context. ``answer_start`` is -1 only when the answer text
    truly does not occur in *context*.
    """
    questions_answers = [
        {"question": "What is the end of Sciences?", "answer": "Concerning Discipline"},
        {"question": "Who should one associate with?", "answer": "The aged"},
        {"question": "What does the institution of spies involve?", "answer": "Protection of parties for or against one's own cause in one's own state"},
        {"question": "What are the duties of a king?", "answer": "Duty towards the harem; personal safety"},
    ]
    lowered = context.lower()
    qa_data = []
    for qa in questions_answers:
        answer = qa["answer"]
        start = context.find(answer)
        if start == -1:
            # Case-insensitive fallback; slice the context so the stored
            # answer text matches the span at answer_start byte-for-byte.
            start = lowered.find(answer.lower())
            if start != -1:
                answer = context[start:start + len(answer)]
        qa_data.append({
            "context": context,
            "question": qa["question"],
            "answers": {"text": [answer], "answer_start": [start]},
        })
    return qa_data
31
+
32
qa_data = generate_qa_pairs(raw_text)

# Persist the synthetic dataset as JSON so a training run is reproducible.
os.makedirs("data", exist_ok=True)
with open("data/train.json", "w") as f:
    json.dump(qa_data, f)

# Convert the row-oriented records into the column-oriented mapping that
# Dataset.from_dict expects ({"context": [...], "question": [...], ...}).
columns = {
    key: [record[key] for record in qa_data]
    for key in ("context", "question", "answers")
}
dataset = Dataset.from_dict(columns)
43
+
44
# 2. Load Pretrained Model and Tokenizer
# DistilBERT (cased) with an extractive-QA head; tokenizer and model must
# come from the same checkpoint so their vocabularies agree.
model_name = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
48
+
49
# 3. Tokenize Dataset
def preprocess_data(examples):
    """Tokenize question/context pairs and label answer-span token positions.

    Bug fixes vs. the original:
    * ``return_offsets_mapping=True`` is now requested — the original popped
      ``offset_mapping`` without asking the tokenizer for it (KeyError).
    * Start/end token indices are located by scanning the character-offset
      mapping; the original used ``sequence_ids.index(1, ...)``, which always
      labelled the first context token(s) regardless of the answer position.

    Samples whose answer is absent (``answer_start`` of -1) or truncated out
    of the window are labelled (0, 0), the conventional "no answer" position.
    """
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True,
    )
    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # sequence_ids marks which tokens belong to the context (id == 1).
        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1

        if (start_char < 0
                or start_char < offsets[context_start][0]
                or end_char > offsets[context_end][1]):
            # Answer missing or not fully inside the tokenized window.
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Walk forward to the last token starting at/before start_char.
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
            # Walk backward to the first token ending at/after end_char.
            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs
77
+
78
# Tokenize every example; batched=True hands preprocess_data dicts of lists.
tokenized_dataset = dataset.map(preprocess_data, batched=True)
79
+
80
# 4. Training Arguments
# NOTE(review): no evaluation split exists and no eval_dataset is passed to
# the Trainer, so the original evaluation_strategy="epoch" made the Trainer
# attempt an evaluation at the end of every epoch and crash with a
# ValueError. Evaluation is therefore left at its default ("no").
training_args = TrainingArguments(
    output_dir="./qa_model",          # checkpoints and final model land here
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",            # checkpoint once per epoch
)
90
+
91
# 5. Trainer Initialization — wires model, hyper-parameters, and data together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,  # saved with checkpoints so they reload standalone
)
98
+
99
# 6. Train the Model — runs the fine-tuning loop configured above.
trainer.train()

# Save the fine-tuned model and tokenizer into one directory so it can be
# reloaded later with from_pretrained("./qa_model").
model.save_pretrained("./qa_model")
tokenizer.save_pretrained("./qa_model")