ojas121 committed
Commit b06005a · verified · 1 Parent(s): e5b3eef

Update app.py

Files changed (1)
  1. app.py +35 -89
app.py CHANGED
@@ -1,11 +1,13 @@
- import os
- import json
- from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
- from datasets import Dataset
-
- # 1. Training Data Preparation
- raw_text = """
- BOOK I. Concerning Discipline.
+ import torch
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+
+ # Load model and tokenizer
+ model_name = "distilbert-base-cased-distilled-squad"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+
+ # Example text
+ context = """BOOK I. Concerning Discipline.
  The end of Sciences; association with the aged; restraint of
  the organs of sense; the creation of ministers; the creation of
  councillors and priests; ascertaining by temptations purity or
@@ -15,90 +17,34 @@ state; winning over the factions for or against an enemy's cause in
  an enemy's state; the business of council meeting; the mission of
  envoys; protection of princes; the conduct of a prince kept under
  restraint; treatment of a prince kept under restraint; the duties of a
- king; duty towards the harem; personal safety.
- """
-
- # Create synthetic QA dataset
- def generate_qa_pairs(context):
-     questions_answers = [
-         {"question": "What is the end of Sciences?", "answer": "Concerning Discipline"},
-         {"question": "Who should one associate with?", "answer": "The aged"},
-         {"question": "What does the institution of spies involve?", "answer": "Protection of parties for or against one's own cause in one's own state"},
-         {"question": "What are the duties of a king?", "answer": "Duty towards the harem; personal safety"},
-     ]
-     qa_data = [{"context": context, "question": qa["question"], "answers": {"text": [qa["answer"]], "answer_start": [context.find(qa["answer"])]}} for qa in questions_answers]
-     return qa_data
-
- qa_data = generate_qa_pairs(raw_text)
-
- # Save data as JSON
- os.makedirs("data", exist_ok=True)
- with open("data/train.json", "w") as f:
-     json.dump(qa_data, f)
-
- # Load as Hugging Face Dataset
- dataset = Dataset.from_dict({"context": [d["context"] for d in qa_data],
-                              "question": [d["question"] for d in qa_data],
-                              "answers": [d["answers"] for d in qa_data]})
-
- # 2. Load Pretrained Model and Tokenizer
- model_name = "distilbert-base-cased"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForQuestionAnswering.from_pretrained(model_name)
-
- # 3. Tokenize Dataset
- def preprocess_data(examples):
-     inputs = tokenizer(
-         examples["question"], examples["context"], truncation=True, padding="max_length", max_length=384
-     )
-     offset_mapping = inputs.pop("offset_mapping")
-     start_positions = []
-     end_positions = []
-
-     for i, offsets in enumerate(offset_mapping):
-         answer = examples["answers"][i]
-         start_char = answer["answer_start"][0]
-         end_char = start_char + len(answer["text"][0])
-
-         sequence_ids = inputs.sequence_ids(i)
-         context_start = sequence_ids.index(1)
-         context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1
-
-         if start_char < offsets[context_start][0] or end_char > offsets[context_end][1]:
-             start_positions.append(0)
-             end_positions.append(0)
-         else:
-             start_positions.append(sequence_ids.index(1, context_start, context_end))
-             end_positions.append(sequence_ids.index(1, start_positions[-1] + 1, context_end))
-
-     inputs["start_positions"] = start_positions
-     inputs["end_positions"] = end_positions
-     return inputs
-
- tokenized_dataset = dataset.map(preprocess_data, batched=True)
-
- # 4. Training Arguments
- training_args = TrainingArguments(
-     output_dir="./qa_model",
-     evaluation_strategy="epoch",
-     learning_rate=2e-5,
-     per_device_train_batch_size=4,
-     num_train_epochs=3,
-     weight_decay=0.01,
-     save_strategy="epoch",
- )
-
- # 5. Trainer Initialization
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=tokenized_dataset,
-     tokenizer=tokenizer,
- )
-
- # 6. Train the Model
- trainer.train()
-
- # Save the fine-tuned model
- model.save_pretrained("./qa_model")
- tokenizer.save_pretrained("./qa_model")
 
+ king; duty towards the harem; personal safety."""
+ question = "What is the end of Sciences?"
+
+ # Tokenize input
+ inputs = tokenizer(
+     question,
+     context,
+     return_tensors="pt",
+     truncation=True,
+     padding=True,
+     max_length=512,
+     return_offsets_mapping=True,
+ )
+ # offset_mapping maps tokens back to character positions, but the model's
+ # forward() does not accept it, so remove it before inference
+ offset_mapping = inputs.pop("offset_mapping")
+
+ # Perform inference (no gradients needed)
+ with torch.no_grad():
+     outputs = model(**inputs)
+
+ # Get start and end logits
+ start_logits = outputs.start_logits
+ end_logits = outputs.end_logits
+
+ # Find the answer
+ start_index = torch.argmax(start_logits)
+ end_index = torch.argmax(end_logits)
+
+ # Decode answer
+ answer = tokenizer.decode(inputs['input_ids'][0][start_index:end_index + 1])
+
+ # Print the result
+ print(f"Question: {question}")
+ print(f"Answer: {answer}")