#R&D - RAG assistant with history for NRL commercial department (Guwahati).
#
# Loads a FAISS index from a Hugging Face dataset repo, answers questions via
# retrieval + FLAN-T5 summarization, and lets an authorized user upload new
# PDF documents to create/extend the index.

import os
import re
import shutil
import time
import json                      # currently unused; kept for compatibility
from collections import deque    # currently unused; kept for compatibility
from datetime import datetime

import gradio as gr
from huggingface_hub import HfApi, file_exists, hf_hub_download
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import pipeline

# Dataset repo that stores the FAISS index files (index.faiss / index.pkl).
user_repo_id = "manabb/nrl"
repo_id = user_repo_id  # used by the upload tab (was previously undefined)
msg = ""

# Rolling conversation history (most recent last); capped at 50 entries.
HISTORY = []

# --- one-time startup: embeddings, FAISS index, retriever, summarizer -------
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
faiss_path = hf_hub_download(repo_id=user_repo_id, filename="index.faiss", repo_type="dataset")
pkl_path = hf_hub_download(repo_id=user_repo_id, filename="index.pkl", repo_type="dataset")
folder_path = os.path.dirname(faiss_path)
# allow_dangerous_deserialization is required because index.pkl is a pickle;
# only safe because we control the source repo.
vectorstore = FAISS.load_local(folder_path, embeddings, allow_dangerous_deserialization=True)
print(f"✅ Vectorstore: {vectorstore.index.ntotal} docs")
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

summarizer = pipeline("summarization", model="google/flan-t5-small", device_map="cpu")

# Question-reframing pipeline, created lazily ONCE and reused (the original
# code rebuilt the model on every query, which is very slow).
_REFRAMER = None


#=========================================TAB-1-START======================================
def intelligently_show_context_with_pages_resources(context, query, docs, top_n=3):
    """Intelligently extract paragraphs with PAGE NUMBERS + RESOURCE names.

    Splits `context` into paragraphs, scores each by word overlap with
    `query`, and renders the `top_n` best ones annotated with page/source
    metadata taken from the retrieved `docs`.

    NOTE(review): the i-th best paragraph is paired with docs[i]'s metadata,
    which is only an approximation — the paragraph may not come from that
    doc. Kept as-is to preserve existing display behavior.
    """
    display_context = []
    display_context.append("📄 ****\n")
    display_context.append("=" * 120)

    # Prefer blank-line paragraph splits; fall back to single newlines.
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', context) if p.strip()]
    if not paragraphs:
        paragraphs = context.split('\n')

    query_words = set(re.findall(r'\w+', query.lower()))

    # Score = fraction of query words present in the paragraph.
    scored_paras = []
    for i, para in enumerate(paragraphs):
        para_words = set(re.findall(r'\w+', para.lower()))
        overlap = len(query_words.intersection(para_words))
        score = overlap / max(len(query_words), 1)
        scored_paras.append((para, score, i))

    scored_paras.sort(key=lambda x: x[1], reverse=True)

    for i, (para, score, para_idx) in enumerate(scored_paras[:top_n]):
        if i < len(docs):
            doc = docs[i]
            metadata = doc.metadata
            # Metadata keys differ between loaders; try the common variants.
            page_num = (metadata.get('page') or metadata.get('source_page')
                        or metadata.get('page_number') or 'N/A')
            resource = (metadata.get('source') or metadata.get('filename')
                        or metadata.get('file_name') or metadata.get('document')
                        or 'Unknown')
            # Some loaders nest the value one level deeper.
            if isinstance(page_num, dict):
                page_num = page_num.get('page', 'N/A')
            if isinstance(resource, dict):
                resource = resource.get('source', 'Unknown')
            page_str = f"📍 Pg {page_num}" if page_num != 'N/A' else "📍 Pg ?"
            resource_str = f"📁 {os.path.basename(resource)}" if resource != 'Unknown' else "📁 Unknown"
        else:
            page_str = "📍 Pg ?"
            resource_str = "📁 Unknown"

        marker = "🔥 TOP" if i < 2 else "⭐ RELEVANT"
        score_pct = int(score * 100)
        display_context.extend([
            f"\n{marker} [{score_pct}%] {page_str} | {resource_str}",
            para,
            "─" * 100
        ])

    if len(scored_paras) > top_n:
        display_context.append(f"\n... +{len(scored_paras)-top_n} more from other pages/resources")

    return "\n".join(display_context)


#===========================================================================
def save_to_history(query, summary, context, docs, timestamp=None):
    """Save query to conversation history (module-level HISTORY, max 50)."""
    if timestamp is None:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    history_entry = {
        "timestamp": timestamp,
        "query": query,
        "summary": summary,
        "context_preview": context[:200] + "..." if len(context) > 200 else context,
        "full_context_length": len(context),
        "retrieved_docs": len(docs),
        "top_resources": [os.path.basename(doc.metadata.get('source', 'Unknown'))
                          for doc in docs[:3]],
        # NOTE(review): sums at most 5 scores but divides by len(docs) —
        # kept to preserve existing values; confirm intent before changing.
        "avg_relevance_score": sum([float(doc.metadata.get('score', 0))
                                    for doc in docs[:5]]) / max(1, len(docs))
    }
    HISTORY.append(history_entry)
    # Keep last 50 entries
    if len(HISTORY) > 50:
        HISTORY.pop(0)
    print(f"💾 Saved to history #{len(HISTORY)}")


#================================================================================
def show_history_compact(limit=3):
    """Compact history for embedding in results."""
    if not HISTORY:
        return "No previous queries yet."
    output = ""
    for i, entry in enumerate(HISTORY[-limit:], 1):
        output += f"\n{i}. **{entry['query'][:50]}...** [{entry['timestamp'][:16]}]"
        output += f"\n 📄 {entry['retrieved_docs']} docs | {entry['top_resources'][0] if entry['top_resources'] else 'N/A'}"
        output += f"\n 💡 {entry['summary'][:60]}..."
        output += "\n" + "─" * 60
    return output


#======================== optimize the question
def reframe_question_with_history(user_question):
    """Rewrite the user's question into a single optimized retrieval query.

    Uses a small FLAN-T5 model; falls back to the original question when the
    model output is empty or implausibly short.
    """
    global _REFRAMER
    # Create the pipeline once and reuse it (was rebuilt per call before).
    if _REFRAMER is None:
        _REFRAMER = pipeline("text2text-generation", model="google/flan-t5-small",
                             device_map="cpu")

    reframe_prompt = f"""Generate a single, comprehensive question that best captures the information needed to address the user's query or intent and includes the context from the conversation history. User's question: {user_question} Only output the optimized question. OPTIMIZED QUESTION:"""

    # do_sample=False makes generation greedy/deterministic; temperature is
    # meaningless in that mode and was dropped.
    reframed = _REFRAMER(
        reframe_prompt,
        max_new_tokens=100,
        do_sample=False
    )[0]['generated_text']

    # Extract just the question
    optimized_question = reframed.split("OPTIMIZED QUESTION:")[-1].strip()
    if not optimized_question or len(optimized_question) < 10:
        optimized_question = user_question  # Fallback
    return optimized_question


#======================== main function - TAB1 ===========================
def summarize_with_flan_t5(query):
    """Generate bullet summary + intelligent context + history tracking.

    Returns a single formatted string (answer panel content), or an error
    message on failure. Fixes vs. original: real newlines instead of the
    literal " /n ", no shadowing of module globals, "query" spelling.
    """
    msg = ""
    try:
        # Reframe question (lightweight FLAN-T5 pass).
        optimized_query = reframe_question_with_history(query)
        msg = msg + " \n Your original query : " + query
        msg = msg + " \n The optimized query : " + optimized_query

        docs = retriever.invoke(optimized_query)
        context = "\n".join([doc.page_content for doc in docs])

        # Truncate context to keep the prompt within the small model's limit.
        bullet_prompt = f"""Summarize as 4-6 bullet points: {context[:900]} Main Points:"""
        bullet_summary = summarizer(bullet_prompt, max_length=200, min_length=50,
                                    do_sample=False)[0]['summary_text']

        smart_context = intelligently_show_context_with_pages_resources(context, query, docs)

        # Save to history, then show the last 3 queries alongside the answer.
        save_to_history(query, bullet_summary, context, docs)
        history_section = show_history_compact(limit=3)

        combined_result = f"""
🤖 **YOUR Query: "{query}"**
📋 **SUMMARY:**
{bullet_summary}
📄 **INTELLIGENT CONTEXT:**
{smart_context}
📜 **RECENT HISTORY** (last 3 queries):
{history_section}"""
        msg = msg + " \n " + combined_result
    except Exception as e1:
        msg = f"Error: {e1}"
    finally:
        # Defensive cleanup of any leftover temp index directory.
        if os.path.exists("temp_faiss"):
            shutil.rmtree("temp_faiss")
    return msg
#============================== Main Function end


def login(user, pwd):
    """Toggle panel visibility based on credentials.

    NOTE(review): credentials are hardcoded in source — move them to
    environment variables (like uploading_password below) when possible.
    """
    if user == "785699" and pwd == "781005":
        return (
            gr.update(visible=False),  # loading_panel
            gr.update(visible=False),  # login_panel
            gr.update(visible=True),   # tabs_panel
            "✅ Login successful"       # status
        )
    return (
        gr.update(visible=False),      # loading_panel
        gr.update(visible=True),       # login_panel
        gr.update(visible=False),      # tabs_panel
        "❌ Invalid credentials"        # status
    )


#================================
def load_resources():
    """Simulate startup delay, then swap loading screen for the login panel."""
    time.sleep(3)  # simulate FAISS / model loading
    return (
        gr.update(visible=False),  # hide loading
        gr.update(visible=True),   # show login_panel
        gr.update(visible=False)   # hide tabs
    )


#=====================================================TAB2 START====================================
def create_faiss_index(repo_id, file, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Create FAISS index from PDF and upload to HF dataset repo.

    Tries PyPDFLoader first, falls back to PyMuPDFLoader. Returns a status
    message describing which path succeeded/failed.
    """
    message = "Index creation started"
    try:
        # Step 1: Create proper embeddings object
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # Step 2: Clean temp directory
        if os.path.exists("temp_faiss"):
            shutil.rmtree("temp_faiss")

        # Step 3: Try PyPDFLoader first
        loader = PyPDFLoader(file)
        documents = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        new_docs = text_splitter.split_documents(documents)
        db = FAISS.from_documents(new_docs, embeddings)
        db.save_local("temp_faiss")

        # Step 4: Upload to HF Hub
        api = HfApi(token=os.getenv("HF_TOKEN"))
        api.upload_file(path_or_fileobj="temp_faiss/index.faiss", path_in_repo="index.faiss",
                        repo_id=repo_id, repo_type="dataset")
        api.upload_file(path_or_fileobj="temp_faiss/index.pkl", path_in_repo="index.pkl",
                        repo_id=repo_id, repo_type="dataset")
        message = "✅ Index created successfully with PyPDFLoader and uploaded to repo"
    except Exception as e1:
        try:
            print(f"PyPDFLoader failed: {e1}")
            # Step 5: Fallback to PyMuPDFLoader
            loader = PyMuPDFLoader(file)
            documents = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            new_docs = text_splitter.split_documents(documents)
            embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
            db = FAISS.from_documents(new_docs, embeddings)
            db.save_local("temp_faiss")
            # Upload
            api = HfApi(token=os.getenv("HF_TOKEN"))
            api.upload_file(path_or_fileobj="temp_faiss/index.faiss", path_in_repo="index.faiss",
                            repo_id=repo_id, repo_type="dataset")
            api.upload_file(path_or_fileobj="temp_faiss/index.pkl", path_in_repo="index.pkl",
                            repo_id=repo_id, repo_type="dataset")
            message = f"✅ PyPDFLoader failed ({e1}), PyMuPDFLoader succeeded and uploaded to repo"
        except Exception as e2:
            message = f"❌ Both loaders failed. PyPDF: {e1}, PyMuPDF: {e2}"
    finally:
        # Cleanup
        if os.path.exists("temp_faiss"):
            shutil.rmtree("temp_faiss")
    return message

# Usage
#result = create_faiss_index("your_username/your-dataset", "path/to/your/file.pdf")
#print(result)


#=============
def update_faiss_from_hf(repo_id, file, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Load existing FAISS from HF, add new docs, push updated version."""
    message = ""
    try:
        # Step 1: Create embeddings
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # Step 2: Download existing FAISS files
        print("Downloading existing FAISS index...")
        faiss_path = hf_hub_download(repo_id=repo_id, filename="index.faiss", repo_type="dataset")
        pkl_path = hf_hub_download(repo_id=repo_id, filename="index.pkl", repo_type="dataset")

        # Step 3: Load existing vectorstore
        folder_path = os.path.dirname(faiss_path)
        vectorstore = FAISS.load_local(
            folder_path=folder_path,
            embeddings=embeddings,
            allow_dangerous_deserialization=True
        )
        message += f"✅ Loaded existing index with {vectorstore.index.ntotal} vectors\n"

        # Step 4: Load new document with fallback
        documents = None
        loaders = [
            ("PyPDFLoader", PyPDFLoader),
            ("PyMuPDFLoader", PyMuPDFLoader)
        ]
        for loader_name, LoaderClass in loaders:
            try:
                print(f"Trying {loader_name}...")
                loader = LoaderClass(file)
                documents = loader.load()
                message += f"✅ Loaded {len(documents)} pages with {loader_name}\n"
                break
            except Exception as e:
                message += f"❌ {loader_name} failed: {str(e)[:100]}...\n"
                continue
        if documents is None:
            return "❌ All PDF loaders failed"

        # Step 5: Split documents
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        new_docs = text_splitter.split_documents(documents)
        message += f"✅ Created {len(new_docs)} chunks from new document\n"

        # Step 6: Add new documents to existing index
        vectorstore.add_documents(new_docs)
        message += f"✅ Added to index. New total: {vectorstore.index.ntotal} vectors\n"

        # Step 7: Save updated index
        temp_dir = "temp_faiss_update"
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        vectorstore.save_local(temp_dir)

        # Step 8: Upload updated files
        api = HfApi(token=os.getenv("HF_TOKEN"))
        api.upload_file(
            path_or_fileobj=f"{temp_dir}/index.faiss",
            path_in_repo="index.faiss",
            repo_id=repo_id,
            repo_type="dataset"
        )
        api.upload_file(
            path_or_fileobj=f"{temp_dir}/index.pkl",
            path_in_repo="index.pkl",
            repo_id=repo_id,
            repo_type="dataset"
        )
        message += f"✅ Successfully updated repo with {len(new_docs)} new chunks!"
    except Exception as e:
        message += f"❌ Update failed: {str(e)}"
    finally:
        # Cleanup
        if os.path.exists("temp_faiss_update"):
            shutil.rmtree("temp_faiss_update")
    return message

# Usage
# result = update_faiss_from_hf("yourusername/my-faiss-store", "new_document.pdf")
# print(result)


#====================
def upload_and_prepare(file, user):
    """Authorize via env password, then update or create the FAISS index.

    Fix vs. original: the "Unauthorized" else now pairs with the password
    check (it previously paired with the inner file_exists check, so an
    authorized user creating a brand-new index saw "Unauthorized").
    """
    mm = ""
    if user == os.getenv("uploading_password"):
        if file_exists(repo_id=repo_id, filename="index.faiss", repo_type="dataset"):
            mm = update_faiss_from_hf(repo_id, file)
        else:
            mm = create_faiss_index(repo_id, file)
    else:
        mm = "❌ Unauthorized User"
    return mm
#========================================TAB2 END=====================================================


#============================================================================================= gradio
with gr.Blocks() as demo:
    status = gr.Markdown("# 🚀 NRL AI Space for commercial department - Guwahati")

    # ---- Loading Screen ----
    with gr.Column(visible=True) as loading_panel:
        gr.Markdown("⏳ Loading resources, please wait...")

    with gr.Column(visible=False) as login_panel:
        user = gr.Textbox(label="Username", placeholder="hint:Pin code of the location where our refinery is")
        pwd = gr.Textbox(label="Password", type="password", placeholder="hint:Pin code of the location where our corporate office is")
        login_btn = gr.Button("Login")

    # ---- Tabs Container (initially hidden) ----
    with gr.Column(visible=False) as tabs_panel:
        with gr.Tab("📄 ASK on manual of procurement of Goods"):
            answer_output1 = gr.Textbox(label="✅ Answer", lines=10, interactive=True)
            query_input1 = gr.Textbox(label="❓ Your Question pls", placeholder="e.g., What is Gem?")
            query_btn1 = gr.Button("🧠 Get Answer", variant="primary")
            query_btn1.click(
                fn=summarize_with_flan_t5,
                inputs=query_input1,
                outputs=answer_output1  # answers with bullets, smart context and history
            )
        with gr.Tab("Upload PDF and create FAISS"):
            gr.Markdown("## 🧠 For uploading new PDF documents.")
            output_msg = gr.Textbox(label="📁 Authorization Message", interactive=False)
            file_input = gr.File(label="📄 Upload .pdf File by only authorized user", type="filepath")
            upload_btn = gr.Button("🔄 Process Doc")
            authorized_user = gr.Textbox(label="Write the password to upload new Circular Doc.")
            upload_btn.click(upload_and_prepare, inputs=[file_input, authorized_user], outputs=output_msg)
        with gr.Tab("📊 Upcoming functionality-2"):
            gr.Textbox(label="Coming soon")
        with gr.Tab("📊 Upcoming functionality-3"):
            gr.Textbox(label="Coming soon")

    # Auto-trigger loading after app starts
    demo.load(
        load_resources,
        outputs=[loading_panel, login_panel, tabs_panel]
    )
    login_btn.click(
        login,
        inputs=[user, pwd],
        outputs=[loading_panel, login_panel, tabs_panel, status]
    )

demo.launch()