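"""RAG Nexus — a Gradio app for document upload, TF-IDF retrieval,
axiom extraction, and retrieval-augmented response generation.

Persistence is a local SQLite database; document processing helpers
live in the companion `utils` module.
"""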
import gradio as gr
import os
import sqlite3
import json
from datetime import datetime
from typing import List, Dict, Any, Tuple
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import threading
from utils import process_document, extract_axioms, generate_response
# Initialize database (this module-level connection is used only for schema setup)
DB_PATH = "rag_nexus.db"
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
cursor = conn.cursor()

# Create tables
cursor.execute("""
    CREATE TABLE IF NOT EXISTS documents (
        id TEXT PRIMARY KEY,
        name TEXT,
        content TEXT,
        size INTEGER,
        uploaded_at TEXT,
        chunk_count INTEGER
    )
""")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS axioms (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        doc_id TEXT,
        source TEXT,
        axiom TEXT,
        confidence REAL,
        FOREIGN KEY (doc_id) REFERENCES documents (id)
    )
""")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS activity (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        action TEXT,
        details TEXT,
        timestamp TEXT
    )
""")
conn.commit()
# Thread-local storage for database connections
thread_local = threading.local()
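# SQLite connections may only be used from the thread that created them;
# Gradio runs event handlers on a worker thread pool, so each thread gets
# its own connection instead of sharing the module-level one.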
def get_db():
    """Get a thread-local database connection."""
    if not hasattr(thread_local, 'conn'):
        thread_local.conn = sqlite3.connect(DB_PATH)
    return thread_local.conn
class RAGState:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        self.document_chunks = []
        self.chunk_metadata = []
        self.is_initialized = False

    def initialize_models(self):
        """Load existing documents and fit the TF-IDF vectorizer on their chunks."""
        if not self.is_initialized:
            conn = get_db()
            cursor = conn.cursor()
            cursor.execute("SELECT id, content FROM documents")
            docs = cursor.fetchall()
            if docs:
                chunks = []
                metadata = []
                for doc_id, content in docs:
                    # Split each document into fixed-size 500-character chunks
                    doc_chunks = [content[i:i + 500] for i in range(0, len(content), 500)]
                    chunks.extend(doc_chunks)
                    metadata.extend([{"doc_id": doc_id, "chunk_idx": i} for i in range(len(doc_chunks))])
                if chunks:
                    self.vectorizer.fit(chunks)
                    self.document_chunks = chunks
                    self.chunk_metadata = metadata
            self.is_initialized = True
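# A lightweight singleton: the state object is stashed as an attribute on the
# function itself, so every handler shares one vectorizer and chunk store.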
def get_state():
    """Get the global application state."""
    if not hasattr(get_state, 'state'):
        get_state.state = RAGState()
    return get_state.state
def log_activity(action: str, details: Dict[str, Any]):
    """Log an activity record to the database."""
    conn = get_db()
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO activity (action, details, timestamp) VALUES (?, ?, ?)",
        (action, json.dumps(details), datetime.now().isoformat())
    )
    conn.commit()
def get_stats():
    """Get system statistics as three display strings (documents, axioms, storage)."""
    conn = get_db()
    cursor = conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM documents")
    doc_count = cursor.fetchone()[0]
    cursor.execute("SELECT COUNT(*) FROM axioms")
    axiom_count = cursor.fetchone()[0]
    cursor.execute("SELECT SUM(size) FROM documents")
    storage = cursor.fetchone()[0] or 0
    # Return a tuple so the values map onto the three analytics labels
    return str(doc_count), str(axiom_count), f"{round(storage / 1024 / 1024, 2)} MB"
def load_documents(search: str = ""):
    """Load all documents, optionally filtered by a name substring."""
    conn = get_db()
    cursor = conn.cursor()
    if search:
        cursor.execute(
            "SELECT id, name, size, uploaded_at FROM documents WHERE name LIKE ? ORDER BY uploaded_at DESC",
            (f"%{search}%",)
        )
    else:
        cursor.execute("SELECT id, name, size, uploaded_at FROM documents ORDER BY uploaded_at DESC")
    docs = cursor.fetchall()
    if not docs:
        return [["No documents found", "", "", ""]]
    return [[doc[1], f"{doc[2]} bytes", doc[3], doc[0]] for doc in docs]
def load_axioms(source_filter: str = ""):
    """Load axioms with an optional document-name filter."""
    conn = get_db()
    cursor = conn.cursor()
    if source_filter:
        cursor.execute("""
            SELECT a.id, a.source, a.axiom, a.confidence, d.name
            FROM axioms a
            JOIN documents d ON a.doc_id = d.id
            WHERE d.name LIKE ?
            ORDER BY a.confidence DESC
        """, (f"%{source_filter}%",))
    else:
        cursor.execute("""
            SELECT a.id, a.source, a.axiom, a.confidence, d.name
            FROM axioms a
            JOIN documents d ON a.doc_id = d.id
            ORDER BY a.confidence DESC
        """)
    axioms = cursor.fetchall()
    if not axioms:
        return [["No axioms found", "", "", "", ""]]
    # Truncate long axiom text for display; only add an ellipsis when needed
    return [
        [ax[4], ax[1], ax[2][:100] + ("..." if len(ax[2]) > 100 else ""), f"{ax[3]:.2f}", str(ax[0])]
        for ax in axioms
    ]
def load_activity():
    """Load the 20 most recent activity records."""
    conn = get_db()
    cursor = conn.cursor()
    cursor.execute("SELECT action, details, timestamp FROM activity ORDER BY timestamp DESC LIMIT 20")
    activities = cursor.fetchall()
    if not activities:
        return [["No activity yet", "", ""]]
    # Details are stored as JSON; show a compact key=value summary
    # (the records never contain a 'description' key)
    return [
        [act[0], ", ".join(f"{k}={v}" for k, v in json.loads(act[1]).items()), act[2]]
        for act in activities
    ]
def process_uploaded_files(files: List[str]) -> Tuple[str, str]:
    """Process uploaded files and return a status message and icon."""
    if not files:
        return "No files uploaded", "⚠️"
    state = get_state()
    success_count = 0
    total_count = len(files)
    for file_path in files:
        try:
            # Process document
            doc = process_document(file_path)
            # Save to database
            conn = get_db()
            cursor = conn.cursor()
            cursor.execute(
                "INSERT INTO documents (id, name, content, size, uploaded_at, chunk_count) VALUES (?, ?, ?, ?, ?, ?)",
                (doc.id, doc.name, doc.content, doc.size, doc.uploaded_at, doc.chunk_count)
            )
            # Extract axioms
            axioms = extract_axioms(doc.content, doc.id)
            for axiom in axioms:
                cursor.execute(
                    "INSERT INTO axioms (doc_id, source, axiom, confidence) VALUES (?, ?, ?, ?)",
                    (doc.id, axiom.source, axiom.text, axiom.confidence)
                )
            conn.commit()
            # Update the in-memory chunk store
            chunks = [doc.content[i:i + 500] for i in range(0, len(doc.content), 500)]
            state.document_chunks.extend(chunks)
            state.chunk_metadata.extend([{"doc_id": doc.id, "chunk_idx": i} for i in range(len(chunks))])
            log_activity("document_uploaded", {
                "name": doc.name,
                "size": doc.size,
                "chunks": doc.chunk_count
            })
            success_count += 1
        except Exception as e:
            log_activity("upload_failed", {
                "file": os.path.basename(file_path),
                "error": str(e)
            })
    # Refit the vectorizer once after all files are ingested rather than per
    # file; refitting changes the vocabulary, so query-time transforms must
    # always use the latest fit
    if state.document_chunks:
        state.vectorizer.fit(state.document_chunks)
    # Clean up temporary files
    for file_path in files:
        try:
            os.unlink(file_path)
        except OSError:
            pass
    return f"Processed {success_count}/{total_count} files", "✅" if success_count == total_count else "⚠️"
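# Retrieval: the query and all stored chunks are projected into the fitted
# TF-IDF space, ranked by cosine similarity, and the top 3 chunks above a
# 0.1 threshold are concatenated into the prompt context.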
def generate_rag_response(query: str, use_axioms: bool, use_context: bool) -> Tuple[str, str]:
    """Generate a response using RAG."""
    if not query.strip():
        return "Please enter a query", ""
    state = get_state()
    state.initialize_models()
    # Retrieve context
    context = ""
    retrieved_docs = []
    if use_context and state.document_chunks:
        try:
            query_vec = state.vectorizer.transform([query])
            doc_vecs = state.vectorizer.transform(state.document_chunks)
            similarities = cosine_similarity(query_vec, doc_vecs).flatten()
            # Get top 3 chunks
            top_indices = np.argsort(similarities)[-3:][::-1]
            for idx in top_indices:
                if similarities[idx] > 0.1:
                    chunk = state.document_chunks[idx]
                    doc_id = state.chunk_metadata[idx]["doc_id"]
                    conn = get_db()
                    cursor = conn.cursor()
                    cursor.execute("SELECT name FROM documents WHERE id = ?", (doc_id,))
                    doc_name = cursor.fetchone()[0]
                    context += f"\n\n--- From {doc_name} ---\n{chunk}"
                    retrieved_docs.append(f"{doc_name} (similarity: {similarities[idx]:.2f})")
        except Exception:
            context = ""
            retrieved_docs = ["No relevant context found"]
    # Get axioms
    axioms = []
    if use_axioms:
        conn = get_db()
        cursor = conn.cursor()
        cursor.execute("SELECT axiom FROM axioms ORDER BY RANDOM() LIMIT 5")
        axioms = [row[0] for row in cursor.fetchall()]
    # Generate response
    response = generate_response(query, context, axioms)
    # Log activity
    log_activity("response_generated", {
        "query": query[:100],
        "used_axioms": use_axioms,
        "used_context": use_context
    })
    # Format context info
    context_info = "\n".join(retrieved_docs) if retrieved_docs else "No context retrieved"
    return response, context_info
def clear_all_data():
    """Clear all data from the database and reset in-memory state."""
    conn = get_db()
    cursor = conn.cursor()
    cursor.execute("DELETE FROM documents")
    cursor.execute("DELETE FROM axioms")
    cursor.execute("DELETE FROM activity")
    conn.commit()
    # Reset state
    state = get_state()
    state.document_chunks = []
    state.chunk_metadata = []
    log_activity("data_cleared", {"all": True})
    # Return a single string, matching the single status output it is wired to
    return "✅ All data cleared successfully"
def export_axioms():
    """Export all axioms as a JSON file."""
    conn = get_db()
    cursor = conn.cursor()
    cursor.execute("""
        SELECT d.name as document, a.source, a.axiom, a.confidence
        FROM axioms a
        JOIN documents d ON a.doc_id = d.id
    """)
    axioms = [
        {"document": row[0], "source": row[1], "axiom": row[2], "confidence": row[3]}
        for row in cursor.fetchall()
    ]
    if not axioms:
        return "⚠️ No axioms to export"
    filename = f"axioms_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(filename, 'w') as f:
        json.dump(axioms, f, indent=2)
    log_activity("axioms_exported", {"count": len(axioms), "file": filename})
    # Return a single string, matching the single export-status output
    return f"✅ Exported {len(axioms)} axioms to {filename}"
# Initialize app state on load
def initialize_app():
    state = get_state()
    state.initialize_models()
    return "✅ Models initialized"
# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🔮 RAG Nexus
        ### Intelligent Document Analysis & Axiom Extraction System
        **Built with anycoder** | [View on Hugging Face](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )
    # Status bar
    with gr.Row():
        status_text = gr.Textbox("Initializing...", label="System Status", scale=4)
        init_btn = gr.Button("🔄 Reinitialize", scale=1)
    # Tabs
    with gr.Tabs() as tabs:
        # Upload Tab
        with gr.TabItem("📤 Upload", id="upload"):
            gr.Markdown("### Upload Documents for Analysis")
            file_output = gr.File(
                label="Drop files here or click to browse",
                file_count="multiple",
                file_types=[".txt", ".md", ".pdf", ".doc", ".docx"]
            )
            upload_btn = gr.Button("🚀 Process Files", variant="primary")
            upload_status = gr.Textbox(label="Upload Status", interactive=False)
            with gr.Accordion("📋 Upload Queue", open=False):
                upload_queue = gr.Dataframe(
                    headers=["File", "Status", "Size (bytes)"],
                    datatype=["str", "str", "number"],
                    label="Processed Files"
                )
        # Documents Tab
        with gr.TabItem("📚 Documents", id="documents"):
            gr.Markdown("### Indexed Documents")
            with gr.Row():
                doc_search = gr.Textbox(
                    placeholder="Search documents...",
                    label="Search",
                    scale=3
                )
                clear_docs_btn = gr.Button("🗑️ Clear All", variant="stop", scale=1)
            documents_table = gr.Dataframe(
                headers=["Name", "Size", "Uploaded", "ID"],
                datatype=["str", "str", "str", "str"],
                label="Documents",
                wrap=True
            )
            # Pass the search text through so the box actually filters results
            doc_search.change(
                fn=load_documents,
                inputs=doc_search,
                outputs=documents_table,
                api_visibility="private"
            )
        # Axioms Tab
        with gr.TabItem("⚡ Axioms", id="axioms"):
            gr.Markdown("### Extracted Axioms")
            with gr.Row():
                axiom_search = gr.Textbox(
                    placeholder="Search axioms...",
                    label="Search",
                    scale=2
                )
                axiom_filter = gr.Dropdown(
                    choices=[],
                    label="Filter by Document",
                    scale=1
                )
                export_axioms_btn = gr.Button("💾 Export JSON", scale=1)
            axioms_table = gr.Dataframe(
                headers=["Document", "Source", "Axiom", "Confidence", "ID"],
                datatype=["str", "str", "str", "number", "str"],
                label="Axioms",
                wrap=True
            )
            export_status = gr.Textbox(label="Export Status", interactive=False)
        # Generate Tab
        with gr.TabItem("🤖 Generate", id="generate"):
            gr.Markdown("### Intelligent Response Generation")
            query_input = gr.Textbox(
                label="Enter your query",
                placeholder="Ask anything about your documents... (e.g., 'What are the fundamental principles based on the uploaded documents?')",
                lines=4,
                max_lines=8
            )
            with gr.Row():
                use_axioms = gr.Checkbox(label="Use Axioms", value=True)
                use_context = gr.Checkbox(label="Use Context (RAG)", value=True)
            generate_btn = gr.Button("🚀 Generate Response", variant="primary")
            with gr.Group():
                response_output = gr.Markdown(
                    label="Generated Response",
                    show_copy_button=True
                )
            with gr.Accordion("📚 Retrieved Context & Axioms", open=False):
                context_output = gr.Textbox(
                    label="Retrieved Documents",
                    lines=5,
                    interactive=False
                )
            query_stats = gr.Textbox(
                label="Query Statistics",
                interactive=False,
                visible=False
            )
        # Analytics Tab
        with gr.TabItem("📊 Analytics", id="analytics"):
            gr.Markdown("### System Analytics")
            with gr.Row():
                with gr.Column():
                    doc_count_label = gr.Label(value="0", label="📄 Documents", show_label=True)
                with gr.Column():
                    axiom_count_label = gr.Label(value="0", label="⚡ Axioms", show_label=True)
                with gr.Column():
                    storage_label = gr.Label(value="0 MB", label="💾 Storage Used", show_label=True)
            with gr.Accordion("📈 Recent Activity", open=True):
                activity_log = gr.Dataframe(
                    headers=["Action", "Details", "Timestamp"],
                    datatype=["str", "str", "str"],
                    label="Activity Log",
                    wrap=True,
                    max_height=300
                )
    # Event handlers
    init_btn.click(
        fn=initialize_app,
        outputs=status_text,
        api_visibility="private"
    )

    # Upload events
    def process_and_update(files):
        if not files:
            return "No files selected", []
        # Capture names and sizes before processing, since the temporary
        # files are deleted once they have been ingested
        queue_data = [
            [os.path.basename(f), os.path.getsize(f) if os.path.exists(f) else 0]
            for f in files
        ]
        status, icon = process_uploaded_files(files)
        queue_table = [[name, "✅ Processed", size] for name, size in queue_data]
        return f"{icon} {status}", queue_table

    upload_btn.click(
        fn=process_and_update,
        inputs=file_output,
        outputs=[upload_status, upload_queue],
        api_visibility="private"
    ).then(
        fn=load_documents,
        outputs=documents_table
    ).then(
        fn=lambda: load_axioms(),
        outputs=axioms_table
    ).then(
        fn=get_stats,
        outputs=[doc_count_label, axiom_count_label, storage_label]
    ).then(
        fn=load_activity,
        outputs=activity_log
    )
    # Documents tab events
    tabs.change(
        fn=load_documents,
        outputs=documents_table,
        api_visibility="private"
    )

    clear_docs_btn.click(
        fn=clear_all_data,
        outputs=[status_text],
        api_visibility="private"
    ).then(
        fn=load_documents,
        outputs=documents_table
    ).then(
        fn=lambda: load_axioms(),
        outputs=axioms_table
    ).then(
        fn=get_stats,
        outputs=[doc_count_label, axiom_count_label, storage_label]
    )
    # Axioms tab events
    def update_axiom_filter():
        conn = get_db()
        cursor = conn.cursor()
        cursor.execute("SELECT DISTINCT name FROM documents")
        docs = [row[0] for row in cursor.fetchall()]
        return gr.Dropdown(choices=[""] + docs)

    tabs.change(
        fn=update_axiom_filter,
        outputs=axiom_filter,
        api_visibility="private"
    )

    axiom_filter.change(
        fn=lambda filter_val: load_axioms(filter_val or ""),
        inputs=axiom_filter,
        outputs=axioms_table,
        api_visibility="private"
    )
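    # The axiom search box is otherwise unwired; a minimal hook that reuses
    # load_axioms's document-name filter so typing narrows the table
    axiom_search.change(
        fn=lambda search: load_axioms(search or ""),
        inputs=axiom_search,
        outputs=axioms_table,
        api_visibility="private"
    )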
    export_axioms_btn.click(
        fn=export_axioms,
        outputs=[export_status],
        api_visibility="private"
    )
    # Generate tab events
    generate_btn.click(
        fn=generate_rag_response,
        inputs=[query_input, use_axioms, use_context],
        outputs=[response_output, context_output],
        api_visibility="private"
    ).then(
        fn=load_activity,
        outputs=activity_log
    )

    # Load initial data
    demo.load(
        fn=initialize_app,
        outputs=status_text,
        api_visibility="private"
    ).then(
        fn=load_documents,
        outputs=documents_table
    ).then(
        fn=lambda: load_axioms(),
        outputs=axioms_table
    ).then(
        fn=get_stats,
        outputs=[doc_count_label, axiom_count_label, storage_label]
    ).then(
        fn=load_activity,
        outputs=activity_log
    ).then(
        fn=update_axiom_filter,
        outputs=axiom_filter
    )
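# Note: `theme` and `footer_links` as launch() arguments, and the
# `api_visibility` keyword on event listeners, follow the Gradio 6 API this
# file targets; on Gradio 4/5 the theme would instead be passed to
# gr.Blocks(theme=...).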
# Launch with the Gradio 6 theme
demo.launch(
    theme=gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="violet",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="lg",
        spacing_size="lg",
        radius_size="md"
    ).set(
        button_primary_background_fill="*primary_600",
        button_primary_background_fill_hover="*primary_700",
        block_title_text_weight="600",
        block_background_fill="*neutral_50"
    ),
    footer_links=[{"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"}],
    show_error=True,
    max_threads=40
)