import os
import re
import warnings

import gradio as gr
import inflection
import torch
import transformers
from bs4 import BeautifulSoup
from better_profanity import profanity
from guardrails import Guard
from guardrails.validators import (
    FailResult,
    PassResult,
    ValidationResult,
    Validator,
    register_validator,
)
from huggingface_hub import InferenceClient
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider, SpacyNlpEngine
from tensorflow import keras
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)

# Suppress all warnings (the ML libraries above emit a lot of deprecation noise).
warnings.filterwarnings("ignore")

"""
For more information on `huggingface_hub` Inference API support, please check the docs:
https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

print("GPU Available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU Found")

# 10-K annual report PDFs that feed the retrieval corpus.
pdf_files = ["Apple-10K-2023.pdf", "Apple-10K-2024.pdf"]

# ### 📌 Step 1: Load Multiple 10-K Financial Report PDFs
all_documents = []


def preprocess_text(text):
    """Clean raw page text extracted from a PDF.

    Strips any HTML tags with BeautifulSoup, then collapses all runs of
    whitespace (including newlines) into single spaces.

    Args:
        text: Raw page content string.

    Returns:
        The cleaned, single-spaced string.
    """
    # Remove HTML tags.
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove extra whitespace and newlines.
    text = re.sub(r'\s+', ' ', text).strip()
    return text


for pdf_path in pdf_files:
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    for doc in documents:
        doc.page_content = preprocess_text(doc.page_content)
    all_documents.extend(documents)

# ### 📌 Step 2: Split Text into Chunks
# Each split keeps metadata locating the chunk in the source document, which
# enables citation later.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
all_splits = text_splitter.split_documents(all_documents)

# ### 📌 Step 3: Create Embeddings using Sentence Transformers
# Check if CUDA (GPU) is available; otherwise, use CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": device})

# ### 📌 Step 4: Store & Retrieve using ChromaDB
vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    persist_directory="content/drive/MyDrive/RAG_DB/chroma_db",
)
retriever = vectordb.as_retriever()

# Choose a smaller T5 model for generation.
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")

# Create Hugging Face pipeline and wrap it for LangChain.
hf_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
)
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# ### 📌 Step 6: Define RAG Prompt
template = """You are an AI assistant answering financial questions using retrieved financial reports.
Use the following retrieved context to answer the question concisely.
Question: {question}
Context: {context}
Answer: """
prompt = ChatPromptTemplate.from_template(template)

# ### 📌 Step 7: Create RAG pipeline
conversation_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


def to_camel_case(text):
    """Convert text to UpperCamelCase using the inflection package.

    NOTE: with ``uppercase_first_letter=True`` this produces CamelCase
    (first letter capitalized), not lowerCamelCase.
    """
    return inflection.camelize(text, uppercase_first_letter=True)


# ### 📌 Step 8: Create a function to get the confidence score
def get_confidence_score(question):
    """Return a 0-1 confidence score for *question* against the vector store.

    Chroma's ``similarity_search_with_score`` returns ``(document, distance)``
    pairs where a LOWER distance means a BETTER match.  The previous version
    took the *maximum* distance as the confidence, which inverted the metric
    (worse matches scored higher).  Here the smallest distance among the top-5
    hits is mapped into (0, 1]: distance 0 -> 1.0, larger distances -> lower
    confidence.
    """
    retrieved_docs_with_scores = vectordb.similarity_search_with_score(question, k=5)
    if not retrieved_docs_with_scores:
        return 0.0
    best_distance = min(score for _, score in retrieved_docs_with_scores)
    return round(1.0 / (1.0 + best_distance), 2)


# ## GuardRail validators

# NLP configuration for Presidio.
# NOTE(review): this dict is defined but never passed to AnalyzerEngine below —
# presumably intended for a custom NlpEngineProvider; confirm before removing.
nlp_configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],  # Specify language
}

# Custom SSN recognizer: matches the US SSN format (123-45-6789).
ssn_regex = r"\b\d{3}-\d{2}-\d{4}\b"
ssn_pattern = Pattern(name="SSN Pattern", regex=ssn_regex, score=0.85)  # Score between 0-1
ssn_recognizer = PatternRecognizer(supported_entity="SSN", patterns=[ssn_pattern])

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(ssn_recognizer)


@register_validator(name="custom_pii_detector", data_type="string")
class CustomPIIDetector(Validator):
    """Fail validation when the query contains PII (phone, email, card, SSN)."""

    def validate(self, value, metadata=None) -> ValidationResult:
        # metadata defaults to None (not {}) to avoid a shared mutable default.
        results = analyzer.analyze(
            text=value,
            entities=["PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD", "SSN"],
            language="en",
        )
        if results:
            detected_entities = ", ".join(set([res.entity_type for res in results]))
            return FailResult(
                error_message=f"Query contains PII: {detected_entities}."
            )
        return PassResult()


# Custom Profanity Detector using better-profanity
@register_validator(name="custom_profanity_detector", data_type="string")
class CustomProfanityDetector(Validator):
    """Fail validation when the query contains profanity."""

    def validate(self, value, metadata=None) -> ValidationResult:
        if profanity.contains_profanity(value):
            return FailResult(
                error_message="Query contains profanity."
            )
        return PassResult()


# Custom Relevance Validator for Finance and Apple-related Queries
@register_validator(name="custom_relevance_detector", data_type="string")
class CustomRelevanceDetector(Validator):
    """Fail validation when the query mentions neither finance nor Apple terms."""

    def validate(self, value, metadata=None) -> ValidationResult:
        finance_keywords = {"revenue", "profit", "expenses", "balance sheet", "earnings",
                            "financial", "investment", "dividends", "assets", "liabilities",
                            "cash flow", "loss", "turnover"}
        apple_keywords = {"apple", "iphone", "macbook", "tim cook", "apple inc", "ios",
                          "mac", "ipad"}
        text_lower = value.lower()
        # Check if any finance-related or Apple-related keyword appears in the query.
        if not any(keyword in text_lower for keyword in (finance_keywords | apple_keywords)):
            return FailResult(
                error_message="Query is not related to finance or Apple."
            )
        return PassResult()


guard = Guard().use(CustomPIIDetector).use(CustomProfanityDetector).use(CustomRelevanceDetector)


# ### 📌 Step 10: Integrate with Gradio UI
def chat_with_rag(message, history):
    """Gradio chat handler: validate the query, run the RAG chain, format output.

    The guardrail validation below was previously commented out, leaving the
    ``guard`` object constructed but never invoked; it is now applied, and a
    failed validation is reported back to the user instead of running the chain.
    """
    try:
        guard.validate(message)
    except Exception as e:
        return f"❌ Guardrail {str(e)}"
    try:
        response = conversation_chain.invoke(message)
        confidence_score = get_confidence_score(message)
        formatted_response = (
            f"**Answer:** {to_camel_case(response)}\n\n"
            f"**Confidence Score:** {confidence_score:.2f}"
        )
        return formatted_response
    except Exception as e:
        return f"Error: {str(e)}"


# A relevant financial question (high-confidence), used by the smoke tests below.
user_input = "what are the biggest challenges for Apple?"
def run_smoke_tests():
    """Print answer + confidence score for three sanity-check queries.

    Previously these queries executed at import time, so merely importing the
    module (e.g. by a hosting runner) performed three full RAG round-trips.
    They now run only from the ``__main__`` guard below.
    """
    smoke_queries = [
        # A relevant financial question (high-confidence).
        "what are the biggest challenges for Apple?",
        # A relevant financial question (low-confidence).
        "what was apple's Total revenue in 2023?",
        # An irrelevant question (e.g., "What is the capital of France?")
        # to check system robustness.
        "What is the capital of France?",
    ]
    for query in smoke_queries:
        confidence_score = get_confidence_score(query)
        output = conversation_chain.invoke(query)
        print(f"📌 **Answer:** {to_camel_case(output)}\n\n**Confidence Score:** {confidence_score:.2f}")


# Create Gradio Chatbot UI with Auto-Clearing Input.
# `demo` stays at module level so external runners can find and serve it.
demo = gr.ChatInterface(
    fn=chat_with_rag,  # Function to generate responses
    title="📊 Financial Basic RAG Chatbot",
    description="Ask questions about Apple's financial reports and get AI-powered answers!",
    theme="soft",
    examples=[
        ["What are factors impacting Apple's financial growth?"],
        ["what was apple's Total revenue in 2023?"],
        ["What is the capital of France?"],
    ],
    submit_btn="Ask",
    stop_btn=None,
)

if __name__ == "__main__":
    run_smoke_tests()
    demo.launch()