Upload 6 files

Browse files

Files changed (6) hide show

dataset_processing.py +27 -0
fine_tuned_tax +0 -0
finetune_tinyllama.py +58 -0
processed_dataset.json +82 -0
tax_train_data.json +82 -0
taxagent.py +69 -85

dataset_processing.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from datasets import load_dataset
+from transformers import AutoTokenizer
+# Model name
+MODEL_NAME = "/falcon-7b"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+# Load dataset
+dataset = load_dataset("json", data_files="tax_train_data.json")  # Replace with actual dataset
+# Preprocessing function
+def preprocess_function(examples):
+    inputs = examples["prompt"]  # Get prompt text
+    targets = examples["response"]  # Get response text
+    # Tokenize both inputs and targets
+    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)
+    labels = tokenizer(targets, padding="max_length", truncation=True, max_length=512)
+    model_inputs["labels"] = labels["input_ids"]  # Add labels to dataset
+    return model_inputs
+# Apply preprocessing to dataset
+processed_dataset = dataset.map(preprocess_function, batched=True)
+# Save processed dataset
+processed_dataset.save_to_disk("processed_dataset.json")

fine_tuned_tax ADDED Viewed

File without changes

finetune_tinyllama.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
+from datasets import load_dataset
+# Model name
+MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+# Load tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    torch_dtype=torch.float16,  # Use float16 for better efficiency
+    device_map="auto"  # Use GPU if available
+)
+# Load dataset from JSON file
+dataset = load_dataset("json", data_files="processed_dataset.json")
+# Tokenization function
+def tokenize_function(examples):
+    return tokenizer(examples["prompt"], examples["response"], padding="max_length", truncation=True)
+# Apply tokenization
+dataset = dataset.map(tokenize_function, batched=True)
+dataset = dataset.remove_columns(["prompt", "response"])  # Keep only tokenized data
+# Data collator (for batching and padding)
+data_collator = DataCollatorForSeq2Seq(
+    tokenizer=tokenizer,
+    model=model,
+    padding=True,
+    return_tensors="pt"
+)
+# Training arguments
+training_args = TrainingArguments(
+    output_dir="./results",
+    num_train_epochs=3,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=4,
+    save_steps=10_000,
+    save_total_limit=2,
+    logging_dir="./logs",
+    logging_steps=200,
+    remove_unused_columns=False,  # Ensure tokenized data isn't removed
+    fp16=True,  # Enable mixed precision if using GPU
+)
+# Trainer setup
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=dataset["train"],
+    data_collator=data_collator,
+)
+# Start training
+trainer.train()

processed_dataset.json ADDED Viewed

	@@ -0,0 +1,82 @@

+[
+    {
+        "prompt": "Calculate tax for ₹10,00,000 at 30% rate.",
+        "response": "The tax is ₹3,00,000."
+    },
+    {
+        "prompt": "Explain Section 80C of the Income Tax Act.",
+        "response": "Section 80C allows deductions up to ₹1,50,000 on PPF, EPF, and life insurance."
+    },
+    {
+        "prompt": "What is the tax on ₹8,50,000 with a 20% slab?",
+        "response": "The tax is ₹1,70,000."
+    },
+    {
+        "prompt": "How does the new tax regime differ from the old tax regime?",
+        "response": "The new tax regime has lower tax rates but fewer deductions, while the old regime allows more exemptions."
+    },
+    {
+        "prompt": "What is the exemption limit under the new tax regime for FY 2023-24?",
+        "response": "The exemption limit is ₹3,00,000 under the new tax regime."
+    },
+    {
+        "prompt": "Is HRA exempt from income tax?",
+        "response": "Yes, House Rent Allowance (HRA) is exempt under Section 10(13A) based on salary, rent paid, and location."
+    },
+    {
+        "prompt": "How to save tax under Section 80D?",
+        "response": "Section 80D allows deductions on health insurance premiums up to ₹25,000 (₹50,000 for senior citizens)."
+    },
+    {
+        "prompt": "What is the capital gains tax on the sale of property?",
+        "response": "Long-term capital gains (LTCG) on property are taxed at 20% with indexation, while short-term gains are taxed as per the income slab."
+    },
+    {
+        "prompt": "Can I claim deductions on home loan interest?",
+        "response": "Yes, under Section 24(b), you can claim up to ₹2,00,000 per year on home loan interest."
+    },
+    {
+        "prompt": "What is the GST rate on restaurant bills?",
+        "response": "The GST rate on restaurant bills is 5% for non-AC restaurants and 18% for AC restaurants."
+    },
+    {
+        "prompt": "What is TDS and when is it deducted?",
+        "response": "Tax Deducted at Source (TDS) is deducted by the payer on salaries, rent, and interest payments as per prescribed rates."
+    },
+    {
+        "prompt": "How can NRIs save tax in India?",
+        "response": "NRIs can save tax through DTAA benefits, NRE accounts, and exemptions on certain investments."
+    },
+    {
+        "prompt": "What is the corporate tax rate in India?",
+        "response": "The corporate tax rate is 22% for domestic companies under the new regime and 30% under the old regime."
+    },
+    {
+        "prompt": "Are agricultural incomes taxable?",
+        "response": "No, agricultural income is exempt from tax under Section 10(1)."
+    },
+    {
+        "prompt": "What are the penalties for late ITR filing?",
+        "response": "A late fee of ₹5,000 applies if filed after the due date, and ₹10,000 for income above ₹5 lakh."
+    },
+    {
+        "prompt": "Explain Section 80G of the Income Tax Act.",
+        "response": "Section 80G allows deductions on donations made to eligible charities, ranging from 50% to 100% of the donation."
+    },
+    {
+        "prompt": "What is Advance Tax, and who needs to pay it?",
+        "response": "Advance Tax is payable if total tax liability exceeds ₹10,000 in a financial year and is paid in installments."
+    },
+    {
+        "prompt": "What is the basic exemption limit for senior citizens?",
+        "response": "The exemption limit for senior citizens (60-80 years) is ₹3,00,000 and ₹5,00,000 for super senior citizens."
+    },
+    {
+        "prompt": "How does tax loss harvesting work?",
+        "response": "Tax loss harvesting helps offset capital gains by selling loss-making stocks to reduce taxable income."
+    },
+    {
+        "prompt": "What is the standard deduction for salaried employees?",
+        "response": "A standard deduction of ₹50,000 is available for salaried and pensioned individuals."
+    }
+]

tax_train_data.json ADDED Viewed

	@@ -0,0 +1,82 @@

+[
+    {
+        "prompt": "Calculate tax for ₹10,00,000 at 30% rate.",
+        "response": "The tax is ₹3,00,000."
+    },
+    {
+        "prompt": "Explain Section 80C of the Income Tax Act.",
+        "response": "Section 80C allows deductions up to ₹1,50,000 on PPF, EPF, and life insurance."
+    },
+    {
+        "prompt": "What is the tax on ₹8,50,000 with a 20% slab?",
+        "response": "The tax is ₹1,70,000."
+    },
+    {
+        "prompt": "How does the new tax regime differ from the old tax regime?",
+        "response": "The new tax regime has lower tax rates but fewer deductions, while the old regime allows more exemptions."
+    },
+    {
+        "prompt": "What is the exemption limit under the new tax regime for FY 2023-24?",
+        "response": "The exemption limit is ₹3,00,000 under the new tax regime."
+    },
+    {
+        "prompt": "Is HRA exempt from income tax?",
+        "response": "Yes, House Rent Allowance (HRA) is exempt under Section 10(13A) based on salary, rent paid, and location."
+    },
+    {
+        "prompt": "How to save tax under Section 80D?",
+        "response": "Section 80D allows deductions on health insurance premiums up to ₹25,000 (₹50,000 for senior citizens)."
+    },
+    {
+        "prompt": "What is the capital gains tax on the sale of property?",
+        "response": "Long-term capital gains (LTCG) on property are taxed at 20% with indexation, while short-term gains are taxed as per the income slab."
+    },
+    {
+        "prompt": "Can I claim deductions on home loan interest?",
+        "response": "Yes, under Section 24(b), you can claim up to ₹2,00,000 per year on home loan interest."
+    },
+    {
+        "prompt": "What is the GST rate on restaurant bills?",
+        "response": "The GST rate on restaurant bills is 5% for non-AC restaurants and 18% for AC restaurants."
+    },
+    {
+        "prompt": "What is TDS and when is it deducted?",
+        "response": "Tax Deducted at Source (TDS) is deducted by the payer on salaries, rent, and interest payments as per prescribed rates."
+    },
+    {
+        "prompt": "How can NRIs save tax in India?",
+        "response": "NRIs can save tax through DTAA benefits, NRE accounts, and exemptions on certain investments."
+    },
+    {
+        "prompt": "What is the corporate tax rate in India?",
+        "response": "The corporate tax rate is 22% for domestic companies under the new regime and 30% under the old regime."
+    },
+    {
+        "prompt": "Are agricultural incomes taxable?",
+        "response": "No, agricultural income is exempt from tax under Section 10(1)."
+    },
+    {
+        "prompt": "What are the penalties for late ITR filing?",
+        "response": "A late fee of ₹5,000 applies if filed after the due date, and ₹10,000 for income above ₹5 lakh."
+    },
+    {
+        "prompt": "Explain Section 80G of the Income Tax Act.",
+        "response": "Section 80G allows deductions on donations made to eligible charities, ranging from 50% to 100% of the donation."
+    },
+    {
+        "prompt": "What is Advance Tax, and who needs to pay it?",
+        "response": "Advance Tax is payable if total tax liability exceeds ₹10,000 in a financial year and is paid in installments."
+    },
+    {
+        "prompt": "What is the basic exemption limit for senior citizens?",
+        "response": "The exemption limit for senior citizens (60-80 years) is ₹3,00,000 and ₹5,00,000 for super senior citizens."
+    },
+    {
+        "prompt": "How does tax loss harvesting work?",
+        "response": "Tax loss harvesting helps offset capital gains by selling loss-making stocks to reduce taxable income."
+    },
+    {
+        "prompt": "What is the standard deduction for salaried employees?",
+        "response": "A standard deduction of ₹50,000 is available for salaried and pensioned individuals."
+    }
+]

taxagent.py CHANGED Viewed

@@ -1,91 +1,82 @@
 import streamlit as st
 import fitz  # PyMuPDF for PDF extraction
-from langchain_community.llms import Ollama
-from langchain.chains import LLMChain
-from langchain.prompts import PromptTemplate
-from langchain.memory import ConversationBufferMemory
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.vectorstores import FAISS
 from langchain.embeddings import OllamaEmbeddings
-import hashlib
-import numpy as np
 # ========================== SESSION STATE INITIALIZATION ========================== #
-if "memory" not in st.session_state:
-    st.session_state.memory = ConversationBufferMemory()
-if "chat_history" not in st.session_state:
-    st.session_state.chat_history = []
 if "legal_knowledge_base" not in st.session_state:
     st.session_state.legal_knowledge_base = ""
-if "user_query" not in st.session_state:
-    st.session_state.user_query = ""
-if "answer" not in st.session_state:
-    st.session_state.answer = ""
 if "vector_db" not in st.session_state:
     st.session_state.vector_db = None
 if "summary" not in st.session_state:
     st.session_state.summary = ""
-if "doc_hash" not in st.session_state:
-    st.session_state.doc_hash = ""
 # ========================== HELPER FUNCTIONS ========================== #
 def compute_file_hash(file):
-    """Computes SHA-256 hash of the uploaded file to check for changes."""
     hasher = hashlib.sha256()
     hasher.update(file.read())
-    file.seek(0)  # Reset file pointer after reading
     return hasher.hexdigest()
 def extract_text_from_pdf(pdf_file):
-    """Extracts text from a PDF file using PyMuPDF (fitz)."""
-    try:
-        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
-        pdf_file.seek(0)  # Reset file pointer
-        text = "\n".join([page.get_text("text") for page in doc])
-        return text.strip() if text.strip() else "No extractable text found in PDF."
-    except Exception as e:
-        return f"Error reading PDF: {e}"
 def summarize_text(text):
-    """Summarizes the extracted legal document using AI."""
-    llm = Ollama(model="llama3:8b")
-    prompt = PromptTemplate(
-        input_variables=["text"],
-        template="Summarize this tax policy document concisely:\n{text}"
-    )
-    chain = LLMChain(llm=llm, prompt=prompt)
-    summary = chain.run(text=text)
     return summary
 def create_vector_db():
-    """Converts the extracted legal document into searchable vector embeddings."""
     text = st.session_state.legal_knowledge_base
     if not text:
         return None
     text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=150)
     texts = text_splitter.split_text(text)
-    embeddings = OllamaEmbeddings(model="llama3")
     return FAISS.from_texts(texts, embeddings)
 def retrieve_relevant_text(query, vector_db):
-    """Fetches relevant sections from the document based on the user's query."""
     if not vector_db:
-        return "No legal document uploaded."
     docs = vector_db.similarity_search(query, k=5)
     retrieved_text = "\n".join([doc.page_content for doc in docs])
     return retrieved_text
-# ========================== AI TAX COMPUTATION & REASONING ========================== #
 def compute_tax_details(query):
-    """Processes user queries related to tax calculations."""
     import re
-    # Extract income & tax rate from query
     income_match = re.search(r"₹?(\d[\d,]*)", query.replace(",", ""))
     tax_rate_match = re.search(r"(\d+)%", query)
@@ -94,77 +85,70 @@ def compute_tax_details(query):
         tax_rate = float(tax_rate_match.group(1))
         computed_tax = round(income * (tax_rate / 100), 2)
-        return f"Based on an income of ₹{income:,.2f} and a tax rate of {tax_rate}%, the calculated tax is **₹{computed_tax:,.2f}.**"
     return None
 def answer_user_query(query):
-    """Answers user queries using retrieved legal text & tax calculations."""
     tax_computation_result = compute_tax_details(query)
     if tax_computation_result:
         st.session_state.answer = tax_computation_result
-        st.session_state.chat_history.append({"query": query, "response": st.session_state.answer})
         return
     if not st.session_state.vector_db:
         st.error("Please upload a document first.")
         return
-    llm = Ollama(model="llama3:8b")
     retrieved_text = retrieve_relevant_text(query, st.session_state.vector_db)
-    combined_context = f"Laws:\n{retrieved_text}\n\nUser Query:\n{query}"
-    prompt_template = PromptTemplate(
-        input_variables=["input_text"],
-        template="""
-        You are an AI legal expert specializing in tax and finance. Answer the user's query using legal context & real-world tax computation.
-        Context:
-        {input_text}
-        """
-    )
-    chain = LLMChain(llm=llm, prompt=prompt_template, memory=st.session_state.memory)
-    st.session_state.answer = chain.run(input_text=combined_context)
-    st.session_state.chat_history.append({"query": query, "response": st.session_state.answer})
-# ========================== MAIN STREAMLIT APP ========================== #
 def main():
     st.title("📜 AI Legal Tax Assistant")
-    uploaded_file = st.file_uploader("📄 Upload Policy PDF", type=["pdf"])
     if uploaded_file:
-        file_hash = compute_file_hash(uploaded_file)
-        if file_hash != st.session_state.doc_hash:
-            st.session_state.doc_hash = file_hash
-            with st.spinner("Extracting text..."):
-                extracted_text = extract_text_from_pdf(uploaded_file)
-                st.session_state.legal_knowledge_base = extracted_text
-                st.success("Policy Document Uploaded & Stored!")
-            with st.spinner("Generating summary..."):
-                st.session_state.summary = summarize_text(extracted_text)
-                st.subheader("📄 Document Summary:")
-                st.text_area("", st.session_state.summary, height=250)
-            with st.spinner("Indexing document for Q&A..."):
-                st.session_state.vector_db = create_vector_db()
-                st.success("Document indexed! Now you can ask questions.")
     st.subheader("💬 Ask Questions:")
-    st.session_state.user_query = st.text_input("Enter your question:")
-    if st.button("Ask") and st.session_state.user_query.strip():
-        with st.spinner("Thinking..."):
-            answer_user_query(st.session_state.user_query)
     if st.session_state.answer:
         st.markdown("### 🤖 AI Response:")
         st.success(st.session_state.answer)
 if __name__ == "__main__":
-    main()

 import streamlit as st
 import fitz  # PyMuPDF for PDF extraction
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+import hashlib
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.vectorstores import FAISS
 from langchain.embeddings import OllamaEmbeddings
+# ========================== LOAD FINE-TUNED MODEL ========================== #
+MODEL_PATH = "./fine_tuned_tinyllama_tax"  # Change to your actual model path
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_PATH,
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+tax_llm = pipeline("text-generation", model=model, tokenizer=tokenizer)
 # ========================== SESSION STATE INITIALIZATION ========================== #
+# Each key below receives its default only when it is not already present in
+# st.session_state.
+# Text extracted from the uploaded PDF (assigned in main()).
 if "legal_knowledge_base" not in st.session_state:
     st.session_state.legal_knowledge_base = ""
+# Index returned by create_vector_db(); None until a document has been indexed.
 if "vector_db" not in st.session_state:
     st.session_state.vector_db = None
+# Summary produced by summarize_text() for the uploaded document.
 if "summary" not in st.session_state:
     st.session_state.summary = ""
+# Latest answer written by answer_user_query() and displayed by main().
+if "answer" not in st.session_state:
+    st.session_state.answer = ""
 # ========================== HELPER FUNCTIONS ========================== #
 def compute_file_hash(file):
+    """Return the SHA-256 hex digest of an uploaded file's contents.
+
+    The stream is read to its end from the current position and then rewound
+    to offset 0 so that later readers see the file from the start.
+    """
     hasher = hashlib.sha256()
     hasher.update(file.read())
+    file.seek(0)  # Reset file pointer
     return hasher.hexdigest()
 def extract_text_from_pdf(pdf_file):
+    """Extracts text from a PDF using PyMuPDF (fitz)."""
+    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+    pdf_file.seek(0)  # Reset pointer
+    text = "\n".join([page.get_text("text") for page in doc])
+    return text.strip() if text.strip() else "No extractable text found in PDF."
 def summarize_text(text):
+    """Summarizes tax policy documents using fine-tuned AI."""
+    prompt = f"Summarize this tax policy document concisely:\n{text}"
+    summary = tax_llm(prompt, max_length=200, do_sample=True)[0]["generated_text"]
     return summary
 def create_vector_db():
+    """Build a FAISS index over st.session_state.legal_knowledge_base.
+
+    Returns None when no text is stored. Otherwise the text is split on
+    newlines with CharacterTextSplitter (chunk_size=1000, chunk_overlap=150),
+    embedded with OllamaEmbeddings and passed to FAISS.from_texts().
+    """
     text = st.session_state.legal_knowledge_base
     if not text:
         return None
     text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=150)
     texts = text_splitter.split_text(text)
+    embeddings = OllamaEmbeddings(model="llama3:8b")
     return FAISS.from_texts(texts, embeddings)
 def retrieve_relevant_text(query, vector_db):
+    """Return the chunks found by similarity_search(query, k=5), joined by newlines.
+
+    When vector_db is falsy the string "No document uploaded." is returned
+    instead of search results.
+    """
     if not vector_db:
+        return "No document uploaded."
     docs = vector_db.similarity_search(query, k=5)
     retrieved_text = "\n".join([doc.page_content for doc in docs])
     return retrieved_text
 def compute_tax_details(query):
+    """Extracts income & tax rate and calculates tax."""
     import re
     income_match = re.search(r"₹?(\d[\d,]*)", query.replace(",", ""))
     tax_rate_match = re.search(r"(\d+)%", query)
         tax_rate = float(tax_rate_match.group(1))
         computed_tax = round(income * (tax_rate / 100), 2)
+        return f"Based on an income of ₹{income:,.2f} and a tax rate of {tax_rate}%, the tax is **₹{computed_tax:,.2f}.**"
     return None
 def answer_user_query(query):
+    """Answers tax-related queries using the fine-tuned model."""
     tax_computation_result = compute_tax_details(query)
     if tax_computation_result:
         st.session_state.answer = tax_computation_result
         return
     if not st.session_state.vector_db:
         st.error("Please upload a document first.")
         return
     retrieved_text = retrieve_relevant_text(query, st.session_state.vector_db)
+    prompt = f"""
+    You are an AI tax expert. Use legal knowledge and tax calculations to answer.
+    Context:
+    {retrieved_text}
+    User Query:
+    {query}
+    Response:
+    """
+    response = tax_llm(prompt, max_length=300, do_sample=True)[0]["generated_text"]
+    st.session_state.answer = response
+# ========================== STREAMLIT UI ========================== #
 def main():
     st.title("📜 AI Legal Tax Assistant")
+    uploaded_file = st.file_uploader("📄 Upload Tax Policy PDF", type=["pdf"])
     if uploaded_file:
+        with st.spinner("Extracting text..."):
+            extracted_text = extract_text_from_pdf(uploaded_file)
+            st.session_state.legal_knowledge_base = extracted_text
+            st.success("Document Uploaded!")
+        with st.spinner("Generating summary..."):
+            st.session_state.summary = summarize_text(extracted_text)
+            st.subheader("📄 Document Summary:")
+            st.text_area("", st.session_state.summary, height=250)
+        with st.spinner("Indexing document..."):
+            st.session_state.vector_db = create_vector_db()
+            st.success("Document indexed! Ask questions now.")
     st.subheader("💬 Ask Questions:")
+    user_query = st.text_input("Enter your question:")
+    if st.button("Ask") and user_query.strip():
+        with st.spinner("Processing..."):
+            answer_user_query(user_query)
     if st.session_state.answer:
         st.markdown("### 🤖 AI Response:")
         st.success(st.session_state.answer)
 if __name__ == "__main__":
+    main()