Spaces:

cihatyldz
/

telgraf

Sleeping

App Files Files Community

telgraf / app.py

cihatyldz

Upload app.py with huggingface_hub

1f864dd verified 18 days ago

raw

history blame contribute delete

11.5 kB


	import gradio as gr
	import json
	import os
	import re
	import time
	from collections import defaultdict

	import networkx as nx
	from community.community_louvain import best_partition
	import plotly.graph_objects as go
	import chromadb
	from sentence_transformers import SentenceTransformer
	from huggingface_hub import InferenceClient, hf_hub_download

	# ============================================================
	# CONFIG
	# ============================================================
	# Model and dataset identifiers used throughout the app.
	LLM_MODEL = "Qwen/Qwen2.5-7B-Instruct"
	EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
	DATASET_REPO = "cihatyldz/telgraf-telconet-dataset"

	# Access token read from the environment; None when the variable is unset.
	HF_TOKEN = os.getenv("HF_TOKEN")

	# Shared clients, created once at import time and reused by every query.
	embed_model = SentenceTransformer(EMBEDDING_MODEL)
	llm_client = InferenceClient(token=HF_TOKEN)

	# ============================================================
	# LOAD DATA
	# ============================================================
	# Text documents fetched from the dataset repo, in display order.
	DOC_FILES = [
	    "data/01_kurumsal_yapi.txt",
	    "data/02_ag_altyapisi.txt",
	    "data/03_ariza_kayitlari.txt",
	    "data/04_sla_musteri.txt",
	    "data/05_teknoloji_yol_haritasi.txt",
	]

	# Maps a document's base name (no directory, no extension) to its full text.
	documents = {}
	for f in DOC_FILES:
	    path = hf_hub_download(repo_id=DATASET_REPO, filename=f, repo_type="dataset", token=HF_TOKEN)
	    # basename + splitext only drops the final extension; a plain
	    # replace(".txt", "") would also eat ".txt" appearing mid-name.
	    name = os.path.splitext(os.path.basename(f))[0]
	    with open(path, "r", encoding="utf-8") as fp:
	        documents[name] = fp.read()

	# Graph JSON from the same dataset repo; later code reads its "entities",
	# "relationships", "partition" and "community_summaries" keys.
	graph_path = hf_hub_download(repo_id=DATASET_REPO, filename="graph_data.json", repo_type="dataset", token=HF_TOKEN)
	with open(graph_path, "r", encoding="utf-8") as fp:
	    graph_data = json.load(fp)

	print(f"Loaded {len(documents)} documents, {len(graph_data['entities'])} entities")

	# ============================================================
	# CHUNKING & INDEX
	# ============================================================
	def semantic_chunk(text, max_chunk_size=600):
	    """Split text into chunks made of whole, non-empty lines.

	    Lines are stripped, blank lines are dropped, and consecutive lines are
	    packed into one chunk (joined by single spaces) until adding the next
	    line would push the running length past max_chunk_size.

	    Args:
	        text: Raw document text; None or an empty string yields no chunks.
	        max_chunk_size: Soft limit on chunk length in characters. A single
	            line longer than the limit is kept whole as its own chunk.

	    Returns:
	        List of chunk strings in document order.
	    """
	    if not text:
	        return []

	    chunks = []
	    buffer = []
	    # Running length counts one separator per buffered line, so the
	    # threshold behaves like a "line + space" string accumulator.
	    buffered_len = 0
	    for line in text.split("\n"):
	        para = line.strip()
	        if not para:
	            continue
	        if buffer and buffered_len + len(para) > max_chunk_size:
	            chunks.append(" ".join(buffer))
	            buffer, buffered_len = [], 0
	        buffer.append(para)
	        buffered_len += len(para) + 1
	    if buffer:
	        chunks.append(" ".join(buffer))
	    return chunks

	# Flatten every document into chunks, keeping a parallel metadata list so
	# each chunk can be traced back to its source document.
	all_chunks, chunk_metadata = [], []
	for doc_name, doc_text in documents.items():
	    for i, chunk in enumerate(semantic_chunk(doc_text)):
	        all_chunks.append(chunk)
	        chunk_metadata.append({"doc_name": doc_name, "chunk_id": f"{doc_name}_chunk_{i}"})

	# ChromaDB collection configured for cosine distance, filled with
	# embeddings computed locally by embed_model.
	chroma_client = chromadb.Client()
	collection = chroma_client.create_collection(name="telconet", metadata={"hnsw:space": "cosine"})
	embeddings = embed_model.encode(all_chunks).tolist()
	collection.add(
	    ids=[m["chunk_id"] for m in chunk_metadata],
	    documents=all_chunks,
	    embeddings=embeddings,
	    metadatas=chunk_metadata,
	)

	# ============================================================
	# REBUILD GRAPH
	# ============================================================
	def normalize_name(name):
	    """Return a canonical lookup key for an entity name.

	    The name is lower-cased, surrounding whitespace is removed and every
	    internal run of whitespace is collapsed to a single space. The previous
	    ``replace(" ", " ")`` swapped a space for an identical space and so
	    normalised nothing.

	    Args:
	        name: Entity name as it appears in the graph data.

	    Returns:
	        The normalised key string.
	    """
	    # str.split() with no argument splits on any whitespace run and drops
	    # leading/trailing whitespace, so join() rebuilds a single-spaced name.
	    return " ".join(name.lower().split())

	# Normalised entity name -> entity record. When two entities normalise to
	# the same key, the later one in the file wins.
	entity_map = {}
	for e in graph_data["entities"]:
	    key = normalize_name(e["name"])
	    entity_map[key] = e

	# Undirected graph keyed by normalised name; the original spelling is kept
	# on the node as "label".
	G = nx.Graph()
	for key, info in entity_map.items():
	    G.add_node(key, label=info["name"], type=info.get("type", "UNKNOWN"))

	# Only add relationships whose two endpoints are both known entities.
	for r in graph_data.get("relationships", []):
	    src = normalize_name(r.get("source", ""))
	    tgt = normalize_name(r.get("target", ""))
	    if src in entity_map and tgt in entity_map:
	        G.add_edge(src, tgt, relation=r.get("relation", ""))

	# Node -> community id, restricted to nodes that exist in G.
	partition = {k: int(v) for k, v in graph_data.get("partition", {}).items() if k in G}
	# Community id (as stored in the JSON) -> summary text.
	community_summaries = graph_data.get("community_summaries", {})

	# ============================================================
	# QUERY FUNCTIONS
	# ============================================================
	def llm_generate(prompt, max_tokens=500):
	    """Send a single-turn user prompt to the chat model and return its reply.

	    Args:
	        prompt: Full prompt text, sent as one user message.
	        max_tokens: Upper bound on generated tokens.

	    Returns:
	        The text of the first choice, or an empty string when the response
	        carries no content.
	    """
	    response = llm_client.chat_completion(
	        model=LLM_MODEL,
	        messages=[{"role": "user", "content": prompt}],
	        max_tokens=max_tokens,
	        temperature=0.2,
	    )
	    # content may be None on the response object; callers expect a string.
	    return response.choices[0].message.content or ""

	def standard_rag_query(question, top_k=5):
	    """Answer a question with plain vector-retrieval RAG.

	    Embeds the question, fetches the nearest chunks from the Chroma
	    collection and asks the LLM to answer in Turkish from that context only.

	    Args:
	        question: User question text.
	        top_k: Number of chunks requested from the vector index.

	    Returns:
	        Tuple ``(answer, info)``; ``info`` names the source documents and
	        the number of chunks actually retrieved.
	    """
	    query_embedding = embed_model.encode([question]).tolist()
	    results = collection.query(query_embeddings=query_embedding, n_results=top_k)
	    retrieved = results["documents"][0]
	    context = "\n\n---\n\n".join(retrieved)

	    # Built from explicit lines so source indentation never leaks into the
	    # prompt text.
	    prompt = (
	        "Aşağıdaki bağlam bilgilerini kullanarak soruyu Türkçe yanıtla.\n"
	        "Sadece bağlamda bulunan bilgileri kullan.\n"
	        "\n"
	        "Bağlam:\n"
	        f"{context}\n"
	        "\n"
	        f"Soru: {question}\n"
	        "\n"
	        "Yanıt:"
	    )
	    answer = llm_generate(prompt)

	    # dict.fromkeys de-duplicates while keeping retrieval order; a set would
	    # shuffle the source list between runs.
	    sources = list(dict.fromkeys(m["doc_name"] for m in results["metadatas"][0]))
	    return answer, f"Kaynak: {', '.join(sources)} | {len(retrieved)} chunk kullanıldı"

	def graphrag_query(question, top_k_comm=3, top_k_chunks=3):
	    """Answer a question using community summaries, graph relations and chunks.

	    Steps:
	      1. Rank community summaries by cosine similarity to the question.
	      2. Collect the nodes of the best communities plus their neighbours.
	      3. Collect every edge touching those nodes (at most 30 go in the prompt).
	      4. Add the nearest text chunks from the vector index.
	      5. Ask the LLM to answer in Turkish from the combined context.

	    Args:
	        question: User question text.
	        top_k_comm: Number of communities to keep.
	        top_k_chunks: Number of text chunks to retrieve.

	    Returns:
	        Tuple ``(answer, info)``; ``info`` lists the chosen community ids
	        and the node / relation counts.
	    """
	    q_emb = embed_model.encode([question])[0]

	    # --- 1. score communities --------------------------------------------
	    comm_scores = {}
	    comm_ids = list(community_summaries)
	    if comm_ids:
	        # One batched encode call instead of one call per summary.
	        summary_embs = embed_model.encode([community_summaries[cid] for cid in comm_ids])
	        q_norm = float((q_emb ** 2).sum() ** 0.5)
	        for cid, s_emb in zip(comm_ids, summary_embs):
	            denom = q_norm * float((s_emb ** 2).sum() ** 0.5)
	            # A zero vector has no direction; score it 0 rather than divide by 0.
	            comm_scores[cid] = float(q_emb @ s_emb) / denom if denom else 0.0
	    top_comms = sorted(comm_scores.items(), key=lambda item: item[1], reverse=True)[:top_k_comm]

	    # --- 2. nodes of the selected communities and their neighbours -------
	    # Summary keys are converted with int() to compare against the integer
	    # community ids stored in partition.
	    top_ids = {int(cid) for cid, _ in top_comms}
	    relevant_nodes = set()
	    for node, comm in partition.items():
	        if comm in top_ids:
	            relevant_nodes.add(node)
	            if node in G:
	                relevant_nodes.update(G.neighbors(node))

	    parts = []
	    for cid, score in top_comms:
	        parts.append(f"[Community {cid} - Skor: {score:.2f}]\n{community_summaries.get(str(cid), '')}")

	    # --- 3. relations touching the relevant nodes ------------------------
	    rels = []
	    for u, v, d in G.edges(data=True):
	        if u in relevant_nodes or v in relevant_nodes:
	            rels.append(
	                f"{entity_map.get(u, {}).get('name', u)} → {d.get('relation', '?')} → "
	                f"{entity_map.get(v, {}).get('name', v)}"
	            )
	    if rels:
	        # Only the first 30 relations go into the prompt.
	        parts.append("\nİlişkiler:\n" + "\n".join(rels[:30]))

	    # --- 4. supporting text chunks (reuse the question embedding) --------
	    chunk_res = collection.query(query_embeddings=[q_emb.tolist()], n_results=top_k_chunks)
	    for ch in chunk_res["documents"][0]:
	        parts.append(f"\n[Metin]\n{ch}")

	    # --- 5. prompt + generation ------------------------------------------
	    # Joined outside the f-string: a backslash inside an f-string expression
	    # is a syntax error before Python 3.12.
	    context = "\n\n".join(parts)
	    prompt = (
	        "Bilgi grafiği ve bağlamı kullanarak soruyu Türkçe yanıtla. "
	        "İlişkileri takip ederek multi-hop çıkarımlar yap.\n"
	        "\n"
	        "Bağlam:\n"
	        f"{context}\n"
	        "\n"
	        f"Soru: {question}\n"
	        "\n"
	        "Yanıt:"
	    )
	    answer = llm_generate(prompt)
	    info = f"Community: {[c[0] for c in top_comms]} | {len(relevant_nodes)} node, {len(rels)} ilişki"
	    return answer, info

	# ============================================================
	# GRAPH VISUALIZATION
	# ============================================================
	def build_graph_figure():
	    """Build the Plotly figure of the knowledge graph, one trace per community.

	    Node positions come from a seeded spring layout, so the picture is the
	    same on every run. Marker size scales with node degree, clamped to the
	    range 8..35. When no partition is available, all nodes are drawn as a
	    single "Community 0" group.

	    Returns:
	        A ``plotly.graph_objects.Figure``; an empty figure when the graph
	        has no nodes.
	    """
	    if G.number_of_nodes() == 0:
	        return go.Figure()

	    pos = nx.spring_layout(G, k=2, iterations=50, seed=42)
	    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7',
	              '#DDA0DD', '#98D8C8', '#F7DC6F', '#BB8FCE', '#85C1E9']

	    # All edges go into one trace; None breaks the line between segments.
	    edge_x, edge_y = [], []
	    for u, v in G.edges():
	        x0, y0 = pos[u]
	        x1, y1 = pos[v]
	        edge_x.extend([x0, x1, None])
	        edge_y.extend([y0, y1, None])

	    fig = go.Figure()
	    fig.add_trace(go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#ccc'),
	                             mode='lines', hoverinfo='none', showlegend=False))

	    # Group nodes by community in one pass. With an empty partition the
	    # fallback group must take its nodes from the graph itself, otherwise
	    # nothing is drawn.
	    if partition:
	        groups = defaultdict(list)
	        for node, cid in partition.items():
	            if node in pos:
	                groups[cid].append(node)
	    else:
	        groups = {0: list(G.nodes())}

	    # Sorted ids give a stable legend order.
	    for cid in sorted(groups):
	        nodes = groups[cid]
	        if not nodes:
	            continue
	        names = [entity_map.get(n, {}).get("name", n) for n in nodes]
	        fig.add_trace(go.Scatter(
	            x=[pos[n][0] for n in nodes],
	            y=[pos[n][1] for n in nodes],
	            mode='markers+text',
	            text=[name[:18] for name in names],
	            textposition="top center",
	            textfont=dict(size=7),
	            hovertext=[
	                f"<b>{name}</b><br>Tür: {entity_map.get(n, {}).get('type', '?')}<br>Bağlantı: {G.degree(n)}"
	                for n, name in zip(nodes, names)
	            ],
	            hoverinfo='text',
	            marker=dict(
	                size=[max(8, min(35, G.degree(n) * 3)) for n in nodes],
	                color=colors[cid % len(colors)],
	                line=dict(width=1, color='white'),
	            ),
	            name=f'Community {cid}',
	        ))

	    fig.update_layout(
	        title='🔌 Telgraf — TelcoNet Knowledge Graph',
	        showlegend=True,
	        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
	        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
	        plot_bgcolor='white',
	        height=600,
	    )
	    return fig

	# Built once at import time; show_graph() returns this same object on every call.
	GRAPH_FIG = build_graph_figure()

	# ============================================================
	# GRADIO UI
	# ============================================================
	# Sample questions offered through gr.Examples next to the question box.
	EXAMPLE_QUESTIONS = [
	"TelcoNet'in CEO'su kimdir?",
	"Bolu fiber kesintisinden hangi müşteriler etkilendi ve toplam tazminat ne kadar?",
	"CTO Elif Demir'e bağlı departman müdürleri kimler ve hangi projelerin sponsorluğunu yapıyor?",
	"Ericsson'un TelcoNet'teki tüm rolleri neler?",
	"DDoS saldırısına kim müdahale etti ve sonrasında hangi projeler başlatıldı?",
	"TelcoNet'in 2026 en büyük teknoloji riskleri ve başlatılan projeler nelerdir?"
	]

	def compare(question):
	    """Run the same question through both pipelines for side-by-side display.

	    Args:
	        question: Text from the question box; may be empty or None.

	    Returns:
	        Tuple ``(rag_answer, rag_info, graph_answer, graph_info)`` matching
	        the four output textboxes. For a blank question both answer slots
	        carry a short notice and no model is called.
	    """
	    question = (question or "").strip()
	    if not question:
	        # Skip the embedding and LLM calls when there is nothing to ask.
	        notice = "Lütfen bir soru yazın."
	        return notice, "", notice, ""

	    rag_ans, rag_info = standard_rag_query(question)
	    graph_ans, graph_info = graphrag_query(question)
	    return rag_ans, rag_info, graph_ans, graph_info

	def show_graph():
	    """Return the knowledge-graph figure stored in GRAPH_FIG for the plot component."""
	    return GRAPH_FIG

	# "About" tab content, assembled line by line so that source indentation
	# cannot turn the markdown into a code block and the table pipes stay
	# unescaped.
	_ABOUT_MARKDOWN = "\n".join([
	    "## 🔌 Telgraf Projesi",
	    "",
	    "GraphRAG vs Standart RAG karşılaştırma demo'su.",
	    "",
	    "Kurgusal şirket: TelcoNet A.Ş. (Türk telekom operatörü)",
	    "",
	    "| Özellik | Standart RAG | GraphRAG |",
	    "|---|---|---|",
	    "| Basit sorular | ✅ İyi | ✅ İyi |",
	    "| Multi-hop | ❌ Zayıf | ✅ Güçlü |",
	    "| İlişki zinciri | ❌ Kuramıyor | ✅ Graph traversal |",
	    "| Global özet | ❌ Yetersiz | ✅ Community summaries |",
	    "",
	    "Tech: Qwen2.5-7B, ChromaDB, NetworkX, Sentence-Transformers",
	    "",
	    "Geliştiren: [Cihat Yıldız](https://huggingface.co/cihatyldz)",
	])

	with gr.Blocks(title="🔌 Telgraf", theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# 🔌 Telgraf — GraphRAG vs Standart RAG")
	    gr.Markdown("Telekom domain'inde Knowledge Graph tabanlı bilgi erişimi karşılaştırması")

	    # Tab 1: ask one question, see both pipelines' answers side by side.
	    with gr.Tab("⚔️ Karşılaştır"):
	        question = gr.Textbox(label="Sorunuzu yazın", placeholder="Örn: Bolu fiber kesintisinin etkileri neler?", lines=2)
	        gr.Examples(examples=[[q] for q in EXAMPLE_QUESTIONS], inputs=question)
	        btn = gr.Button("🔍 Karşılaştır", variant="primary")

	        with gr.Row():
	            with gr.Column():
	                gr.Markdown("### 📘 Standart RAG")
	                rag_answer = gr.Textbox(label="Cevap", lines=10)
	                rag_meta = gr.Textbox(label="Detay", lines=2)
	            with gr.Column():
	                gr.Markdown("### 📗 GraphRAG")
	                graph_answer = gr.Textbox(label="Cevap", lines=10)
	                graph_meta = gr.Textbox(label="Detay", lines=2)

	        # Output order must match the tuple returned by compare().
	        btn.click(compare, inputs=question, outputs=[rag_answer, rag_meta, graph_answer, graph_meta])

	    # Tab 2: the prebuilt knowledge-graph figure, shown on demand.
	    with gr.Tab("🕸️ Knowledge Graph"):
	        gr.Markdown("### İnteraktif Bilgi Grafiği")
	        graph_btn = gr.Button("📊 Grafiği Göster")
	        graph_plot = gr.Plot()
	        graph_btn.click(show_graph, outputs=graph_plot)

	    # Tab 3: one collapsed accordion per source document.
	    with gr.Tab("📄 Dokümanlar"):
	        for name, text in documents.items():
	            with gr.Accordion(name, open=False):
	                gr.Textbox(value=text, lines=15, interactive=False)

	    # Tab 4: project description.
	    with gr.Tab("ℹ️ Hakkında"):
	        gr.Markdown(_ABOUT_MARKDOWN)

	demo.launch()