# NRL AI Space — Gradio RAG app (Hugging Face Spaces).
# (The lines "Spaces: / Sleeping / Sleeping" were HF Spaces page residue from a
# copy-paste, not part of the program.)
#R&D
#correct with history
import json
import os
import re
import shutil
import time
from collections import deque
from datetime import datetime

import gradio as gr
from huggingface_hub import HfApi, file_exists, hf_hub_download
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import pipeline
# --- Module-level setup: embeddings, FAISS vectorstore, retriever, summarizer ---
user_repo_id = "manabb/nrl"  # HF dataset repo holding the prebuilt FAISS index
msg = ""  # module-level scratch string (shadowed by locals in the handlers below)
# History storage
HISTORY = []  # in-memory conversation history; capped at 50 entries by save_to_history()
# Sentence-transformer used both for querying and (in the upload tab) rebuilding the index.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Download the two files that make up a persisted FAISS store.
faiss_path = hf_hub_download(repo_id=user_repo_id, filename="index.faiss", repo_type="dataset")
pkl_path = hf_hub_download(repo_id=user_repo_id, filename="index.pkl", repo_type="dataset")
folder_path = os.path.dirname(faiss_path)
# NOTE(review): allow_dangerous_deserialization unpickles repo content — only
# acceptable because the dataset repo is controlled by the app owner.
vectorstore = FAISS.load_local(folder_path, embeddings, allow_dangerous_deserialization=True)
print(f"✅ Vectorstore: {vectorstore.index.ntotal} docs")
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
# Small CPU summarization model used by the Q&A tab.
summarizer = pipeline("summarization", model="google/flan-t5-small", device_map="cpu")
| #=========================================TAB-1-START====================================== | |
#=========================================TAB-1-START======================================
def intelligently_show_context_with_pages_resources(context, query, docs, top_n=3):
    """Rank the paragraphs of `context` by word overlap with `query` and render
    the top `top_n`, each annotated with a page number and resource name pulled
    from the corresponding retrieved document's metadata.

    Args:
        context: concatenated page_content of the retrieved documents.
        query: the user's (original) question.
        docs: retrieved documents; only `.metadata` is read here.
        top_n: number of paragraphs to display.

    Returns:
        A single display string (headers, ranked paragraphs, separators).
    """
    out = ["📄 ****\n", "=" * 120]
    # Split on blank lines; fall back to single-line split when the context
    # contains no paragraph breaks at all.
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', context) if p.strip()]
    if not paragraphs:
        paragraphs = context.split('\n')
    query_words = set(re.findall(r'\w+', query.lower()))
    scored = []
    for idx, para in enumerate(paragraphs):
        overlap = len(query_words & set(re.findall(r'\w+', para.lower())))
        scored.append((para, overlap / max(len(query_words), 1), idx))
    scored.sort(key=lambda item: item[1], reverse=True)
    for rank, (para, score, _) in enumerate(scored[:top_n]):
        if rank < len(docs):
            # NOTE(review): pairs the rank-th paragraph with the rank-th
            # retrieved doc — same heuristic pairing as before; the paragraph
            # may not actually originate from that doc.
            meta = docs[rank].metadata
            page = (meta.get('page') or meta.get('source_page') or
                    meta.get('page_number') or 'N/A')
            src = (meta.get('source') or meta.get('filename') or
                   meta.get('file_name') or meta.get('document') or 'Unknown')
            if isinstance(page, dict):
                page = page.get('page', 'N/A')
            if isinstance(src, dict):
                src = src.get('source', 'Unknown')
            page_str = f"📍 Pg {page}" if page != 'N/A' else "📍 Pg ?"
            resource_str = f"📁 {os.path.basename(src)}" if src != 'Unknown' else "📁 Unknown"
        else:
            page_str = "📍 Pg ?"
            resource_str = "📁 Unknown"
        marker = "🔥 TOP" if rank < 2 else "⭐ RELEVANT"
        out.append(f"\n{marker} [{int(score * 100)}%] {page_str} | {resource_str}")
        out.append(para)
        out.append("─" * 100)
    if len(scored) > top_n:
        out.append(f"\n... +{len(scored) - top_n} more from other pages/resources")
    return "\n".join(out)
| #=========================================================================== | |
#===========================================================================
def save_to_history(query, summary, context, docs, timestamp=None):
    """Append one entry describing a Q&A round to the module-level HISTORY list.

    Args:
        query: the user's original question.
        summary: the generated bullet summary.
        context: the full retrieved context string.
        docs: the retrieved documents (``.metadata`` is read for source/score).
        timestamp: optional preformatted timestamp; defaults to "now".

    Side effects:
        Mutates the global HISTORY, keeping at most the last 50 entries.
    """
    if timestamp is None:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    history_entry = {
        "timestamp": timestamp,
        "query": query,
        "summary": summary,
        # Ternary covers the whole concatenation: truncated preview + "..."
        "context_preview": context[:200] + "..." if len(context) > 200 else context,
        "full_context_length": len(context),
        "retrieved_docs": len(docs),
        "top_resources": [os.path.basename(doc.metadata.get('source', 'Unknown')) for doc in docs[:3]],
        # NOTE(review): sums scores of the first 5 docs but divides by the TOTAL
        # doc count, so with >5 docs this under-reports the average — confirm
        # whether len(docs[:5]) was intended as the denominator.
        "avg_relevance_score": sum([float(doc.metadata.get('score', 0)) for doc in docs[:5]]) / max(1, len(docs))
    }
    HISTORY.append(history_entry)
    # Keep last 50 entries
    if len(HISTORY) > 50:
        HISTORY.pop(0)
    print(f"💾 Saved to history #{len(HISTORY)}")
| #================================================================================ | |
#================================================================================
def show_history_compact(limit=3):
    """Render the most recent `limit` entries of the global HISTORY as a short
    text digest (query, timestamp, doc count, top resource, summary preview).

    Returns:
        The digest string, or a placeholder when HISTORY is empty.
    """
    if not HISTORY:
        return "No previous queries yet."
    parts = []
    for idx, entry in enumerate(HISTORY[-limit:], 1):
        top_resource = entry['top_resources'][0] if entry['top_resources'] else 'N/A'
        parts.append(f"\n{idx}. **{entry['query'][:50]}...** [{entry['timestamp'][:16]}]")
        parts.append(f"\n   📄 {entry['retrieved_docs']} docs | {top_resource}")
        parts.append(f"\n   💡 {entry['summary'][:60]}...")
        parts.append("\n" + "─" * 60)
    return "".join(parts)
| #========================optimized the question | |
#========================optimized the question
def reframe_question_with_history(user_question):
    """Rewrite the user's question into a single self-contained query via a
    small FLAN-T5 model.

    NOTE(review): despite the name, no conversation history is actually fed
    into the prompt — only the raw question. Confirm whether history injection
    was intended.

    Args:
        user_question: the raw question typed by the user.

    Returns:
        The model's reformulated question, or `user_question` unchanged when
        the model output is empty or implausibly short.
    """
    # Reframing prompt
    reframe_prompt = f"""Generate a single, comprehensive question that best captures the information needed to address the user's query or intent and includes the context from the conversation history.
User's question: {user_question}
Only output the optimized question.
OPTIMIZED QUESTION:"""
    # Fix: the pipeline (model load) used to be recreated on EVERY call, which
    # is very expensive; cache it on the function after the first use.
    reframer = getattr(reframe_question_with_history, "_reframer", None)
    if reframer is None:
        reframer = pipeline("text2text-generation", model="google/flan-t5-small", device_map="cpu")
        reframe_question_with_history._reframer = reframer
    # Fix: dropped the conflicting `max_length` (max_new_tokens already bounds
    # generation; passing both triggers a transformers conflict warning).
    # `temperature` is inert with do_sample=False but kept for compatibility.
    reframed = reframer(
        reframe_prompt,
        max_new_tokens=100,
        temperature=0.1,
        do_sample=False
    )[0]['generated_text']
    # Extract just the question (model may echo the prompt scaffold).
    optimized_question = reframed.split("OPTIMIZED QUESTION:")[-1].strip()
    if not optimized_question or len(optimized_question) < 10:
        optimized_question = user_question  # Fallback to the raw question
    return optimized_question
| #========================main funcition-TAB1=========================== | |
#========================main function-TAB1===========================
def summarize_with_flan_t5(query):
    """Answer `query` against the FAISS store: reframe the question, retrieve
    docs, produce a bullet summary, a ranked context view, and a recent-history
    digest — all combined into one display string.

    Fixes vs. original: the literal " /n " typos are now real newlines, the
    docstring is in docstring position (it previously followed statements, so
    it was a no-op expression), the unused local `user_repo_id` is removed, and
    the user-facing "querry" typos are corrected.

    Args:
        query: the user's question from the Gradio textbox.

    Returns:
        A markdown-ish result string, or "Error: ..." on any failure.
    """
    msg = ""
    try:
        # Reframe the question before retrieval.
        optimized_query = reframe_question_with_history(query)
        msg = msg + " \n Your original query : " + query
        msg = msg + " \n The optimized query : " + optimized_query
        docs = retriever.invoke(optimized_query)
        context = "\n".join(doc.page_content for doc in docs)
        # Only the first 900 chars fit the small summarizer's input budget.
        bullet_prompt = f"""Summarize as 4-6 bullet points:
{context[:900]}
Main Points:"""
        bullet_summary = summarizer(bullet_prompt, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
        smart_context = intelligently_show_context_with_pages_resources(context, query, docs)
        # Persist this round, then render the last 3 rounds below the answer.
        save_to_history(query, bullet_summary, context, docs)
        history_section = show_history_compact(limit=3)
        combined_result = f"""
🤖 **YOUR Query: "{query}"**
📋 **SUMMARY:**
{bullet_summary}
📄 **INTELLIGENT CONTEXT:**
{smart_context}
📜 **RECENT HISTORY** (last 3 queries):
{history_section}"""
        msg = msg + " \n " + combined_result
    except Exception as e1:
        msg = f"Error: {e1}"
    finally:
        # Defensive cleanup of the temp dir used by the upload tab.
        if os.path.exists("temp_faiss"):
            shutil.rmtree("temp_faiss")
    return msg
| #==============================Main Function end | |
#==============================Main Function end
def login(user, pwd):
    """Check the hard-coded credentials and toggle panel visibility.

    Returns a 4-tuple of Gradio updates for (loading_panel, login_panel,
    tabs_panel) plus a status message: on success the tabs are shown, on
    failure the login panel stays visible.
    """
    authorized = user == "785699" and pwd == "781005"
    return (
        gr.update(visible=False),            # loading_panel: always hidden here
        gr.update(visible=not authorized),   # login_panel: stays up on failure
        gr.update(visible=authorized),       # tabs_panel: revealed on success
        "✅ Login successful" if authorized else "❌ Invalid credentials",  # status
    )
| #================================ | |
#================================
def load_resources():
    """Simulate startup work, then swap the loading screen for the login panel.

    Returns Gradio visibility updates for (loading_panel, login_panel,
    tabs_panel).
    """
    time.sleep(3)  # simulate FAISS / model loading
    visibility = (False, True, False)  # hide loading, show login, hide tabs
    return tuple(gr.update(visible=flag) for flag in visibility)
| #=====================================================TAB2 START==================================== | |
| #============================================= | |
#=====================================================TAB2 START====================================
#=============================================
def create_faiss_index(repo_id, file, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Create a FAISS index from a PDF and upload it to a HF dataset repo.

    Tries PyPDFLoader first; if ANY step of that pipeline (load, split, embed,
    save or upload) raises, retries the whole pipeline with PyMuPDFLoader.

    NOTE(review): PyPDFLoader, PyMuPDFLoader, RecursiveCharacterTextSplitter
    and HfApi are used here but were not imported anywhere in this file —
    confirm the module-level imports include them.

    Args:
        repo_id: HF dataset repo to receive ``index.faiss`` / ``index.pkl``.
        file: local filesystem path of the PDF to index.
        embedding_model: sentence-transformers model name.

    Returns:
        A human-readable status message describing success or both failures.
    """
    message = "Index creation started"
    try:
        # Step 1: Create proper embeddings object (CRITICAL FIX)
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        # Step 2: Clean temp directory
        if os.path.exists("temp_faiss"):
            shutil.rmtree("temp_faiss")
        # Step 3: Try PyPDFLoader first
        loader = PyPDFLoader(file)
        documents = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        new_docs = text_splitter.split_documents(documents)
        db = FAISS.from_documents(new_docs, embeddings)
        db.save_local("temp_faiss")
        # Step 4: Upload to HF Hub (token comes from the HF_TOKEN env var)
        api = HfApi(token=os.getenv("HF_TOKEN"))
        api.upload_file(path_or_fileobj="temp_faiss/index.faiss", path_in_repo="index.faiss", repo_id=repo_id, repo_type="dataset")
        api.upload_file(path_or_fileobj="temp_faiss/index.pkl", path_in_repo="index.pkl", repo_id=repo_id, repo_type="dataset")
        message = "✅ Index created successfully with PyPDFLoader and uploaded to repo"
    except Exception as e1:
        try:
            print(f"PyPDFLoader failed: {e1}")
            # Step 5: Fallback to PyMuPDFLoader (repeats the whole pipeline)
            loader = PyMuPDFLoader(file)
            documents = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            new_docs = text_splitter.split_documents(documents)
            # Use same embeddings instance (recreated — cheap relative to indexing)
            embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
            db = FAISS.from_documents(new_docs, embeddings)
            db.save_local("temp_faiss")
            # Upload
            api = HfApi(token=os.getenv("HF_TOKEN"))
            api.upload_file(path_or_fileobj="temp_faiss/index.faiss", path_in_repo="index.faiss", repo_id=repo_id, repo_type="dataset")
            api.upload_file(path_or_fileobj="temp_faiss/index.pkl", path_in_repo="index.pkl", repo_id=repo_id, repo_type="dataset")
            message = f"✅ PyPDFLoader failed ({e1}), PyMuPDFLoader succeeded and uploaded to repo"
        except Exception as e2:
            message = f"❌ Both loaders failed. PyPDF: {e1}, PyMuPDF: {e2}"
    finally:
        # Cleanup the local copy regardless of outcome
        if os.path.exists("temp_faiss"):
            shutil.rmtree("temp_faiss")
    return message
# Usage
#result = create_faiss_index("your_username/your-dataset", "path/to/your/file.pdf")
#print(result)
| # Usage | |
| #result = create_faiss_index("your_username/your-dataset", "path/to/your/file.pdf") | |
| #print(result) | |
| #============= | |
#=============
def update_faiss_from_hf(repo_id, file, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Load the existing FAISS index from HF, add a new PDF's chunks, push back.

    Pipeline: download index.faiss/index.pkl → load vectorstore → load the PDF
    (PyPDFLoader, falling back to PyMuPDFLoader) → split → add_documents →
    save locally → upload both files back to the dataset repo.

    Args:
        repo_id: HF dataset repo holding (and receiving) the index files.
        file: local filesystem path of the new PDF.
        embedding_model: sentence-transformers model name (must match the one
            the existing index was built with for the vectors to be comparable).

    Returns:
        A multi-line progress/status message.
    """
    message = ""
    try:
        # Step 1: Create embeddings
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        # Step 2: Download existing FAISS files
        print("Downloading existing FAISS index...")
        faiss_path = hf_hub_download(repo_id=repo_id, filename="index.faiss", repo_type="dataset")
        pkl_path = hf_hub_download(repo_id=repo_id, filename="index.pkl", repo_type="dataset")
        # Step 3: Load existing vectorstore (pickle deserialization of our own repo)
        folder_path = os.path.dirname(faiss_path)
        vectorstore = FAISS.load_local(
            folder_path=folder_path,
            embeddings=embeddings,
            allow_dangerous_deserialization=True
        )
        message += f"✅ Loaded existing index with {vectorstore.index.ntotal} vectors\n"
        # Step 4: Load new document with fallback between the two PDF loaders
        documents = None
        loaders = [
            ("PyPDFLoader", PyPDFLoader),
            ("PyMuPDFLoader", PyMuPDFLoader)
        ]
        for loader_name, LoaderClass in loaders:
            try:
                print(f"Trying {loader_name}...")
                loader = LoaderClass(file)
                documents = loader.load()
                message += f"✅ Loaded {len(documents)} pages with {loader_name}\n"
                break
            except Exception as e:
                message += f"❌ {loader_name} failed: {str(e)[:100]}...\n"
                continue
        if documents is None:
            # NOTE(review): this early return discards the per-loader failure
            # details accumulated in `message` — confirm that is intentional.
            return "❌ All PDF loaders failed"
        # Step 5: Split documents
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        new_docs = text_splitter.split_documents(documents)
        message += f"✅ Created {len(new_docs)} chunks from new document\n"
        # Step 6: Add new documents to existing index (in memory)
        vectorstore.add_documents(new_docs)
        message += f"✅ Added to index. New total: {vectorstore.index.ntotal} vectors\n"
        # Step 7: Save updated index to a fresh temp dir
        temp_dir = "temp_faiss_update"
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        vectorstore.save_local(temp_dir)
        # Step 8: Upload updated files (token from HF_TOKEN env var)
        api = HfApi(token=os.getenv("HF_TOKEN"))
        api.upload_file(
            path_or_fileobj=f"{temp_dir}/index.faiss",
            path_in_repo="index.faiss",
            repo_id=repo_id,
            repo_type="dataset"
        )
        api.upload_file(
            path_or_fileobj=f"{temp_dir}/index.pkl",
            path_in_repo="index.pkl",
            repo_id=repo_id,
            repo_type="dataset"
        )
        message += f"✅ Successfully updated repo with {len(new_docs)} new chunks!"
    except Exception as e:
        message += f"❌ Update failed: {str(e)}"
    finally:
        # Cleanup local temp dir regardless of outcome
        if os.path.exists("temp_faiss_update"):
            shutil.rmtree("temp_faiss_update")
    return message
# Usage
# result = update_faiss_from_hf("yourusername/my-faiss-store", "new_document.pdf")
# print(result)
| # Usage | |
| # result = update_faiss_from_hf("yourusername/my-faiss-store", "new_document.pdf") | |
| # print(result) | |
| #==================== | |
#====================
def upload_and_prepare(file, user):
    """Password-gated handler for the upload tab: update the existing FAISS
    index with the new PDF, or create the index if it does not exist yet.

    Fixes vs. original: it referenced an undefined global ``repo_id`` (a
    guaranteed NameError) — it now uses the module-level ``user_repo_id`` —
    and it called ``file_exists`` twice per upload; a single check is
    equivalent, since after a successful update the index always exists.

    Args:
        file: local path of the uploaded PDF (Gradio ``type="filepath"``).
        user: password typed by the uploader; compared against the
            ``uploading_password`` environment variable.

    Returns:
        A status message from the indexing helpers, or an authorization error.
    """
    if user != os.getenv("uploading_password"):
        return "❌ Unauthorized User"
    if file_exists(repo_id=user_repo_id, filename="index.faiss", repo_type="dataset"):
        return update_faiss_from_hf(user_repo_id, file)
    return create_faiss_index(user_repo_id, file)
| #create_faiss_index(repo_id, file_input) | |
| #========================================TAB2 END===================================================== | |
| #=============================================================================================gradio | |
#=============================================================================================gradio
# UI: loading screen -> login panel -> tabbed app (Q&A tab + upload tab).
with gr.Blocks() as demo:
    status = gr.Markdown("# 🚀 NRL AI Space for commercial department - Guwahati")
    # ---- Loading Screen ----
    with gr.Column(visible=True) as loading_panel:
        gr.Markdown("⏳ Loading resources, please wait...")
    # ---- Login panel (revealed once load_resources finishes) ----
    with gr.Column(visible=False) as login_panel:
        user = gr.Textbox(label="Username", placeholder="hint:Pin code of the location where our refinery is")
        pwd = gr.Textbox(label="Password", type="password", placeholder="hint:Pin code of the location where our corporate office is")
        login_btn = gr.Button("Login")
    # ---- Tabs Container (initially hidden; shown after successful login) ----
    with gr.Column(visible=False) as tabs_panel:
        with gr.Tab("📄 ASK on manual of procurement of Goods"):
            answer_output1 = gr.Textbox(label="✅ Answer", lines=10, interactive=True)
            query_input1 = gr.Textbox(label="❓ Your Question pls", placeholder="e.g., What is Gem?")
            query_btn1 = gr.Button("🧠 Get Answer", variant="primary")
            query_btn1.click(
                fn=summarize_with_flan_t5,
                inputs=query_input1,
                outputs=answer_output1  # answers with bullet, smart context and history
            )
        with gr.Tab("Upload PDF and create FAISS"):
            gr.Markdown("## 🧠 For uploading new PDF documents.")
            output_msg = gr.Textbox(label="📁 Authorization Message", interactive=False)
            file_input = gr.File(label="📄 Upload .pdf File by only authorized user", type="filepath")
            upload_btn = gr.Button("🔄 Process Doc")
            authorized_user=gr.Textbox(label="Write the password to upload new Circular Doc.")
            upload_btn.click(upload_and_prepare, inputs=[file_input, authorized_user], outputs=output_msg)
        with gr.Tab("📊 Upcoming functionality-2"):
            gr.Textbox(label="Coming soon")
        with gr.Tab("📊 Upcoming functionality-3"):
            gr.Textbox(label="Coming soon")
    # Auto-trigger loading after app starts
    demo.load(
        load_resources,
        outputs=[loading_panel, login_panel, tabs_panel]
    )
    # Wire the login button to the credential check / panel toggling.
    login_btn.click(
        login,
        inputs=[user, pwd],
        outputs=[loading_panel, login_panel, tabs_panel, status]
    )
demo.launch()