"""Visualization, analytics, audio and translation helpers for the RAG app.

Every generator in this module writes its artifact (map, word cloud, audio,
chat export) to a fixed file name in the current working directory and
returns that path, so callers can hand the path straight to the UI layer.
"""

import html
import json
import os
import re
import subprocess
import tempfile
from collections import Counter

import folium
import matplotlib.pyplot as plt
import pandas as pd
import speech_recognition as sr
from deep_translator import GoogleTranslator
from gtts import gTTS
from textblob import TextBlob
from wordcloud import WordCloud

# The module-level ``_index`` and ``_retriever`` objects of rag_pipeline are
# read at call time to reach the indexed documents.
import rag_pipeline

# ── Map Generation ──────────────────────────────────────────────
LOCATIONS = [
    {
        "name": "Jerusalem (Al-Quds)",
        "lat": 31.7683,
        "lon": 35.2137,
        "query": "Jerusalem Al-Quds occupation history destruction",
        "desc": "The capital of Palestine, central to its history, culture, and religious identity.",
    },
    {
        "name": "Gaza",
        "lat": 31.5017,
        "lon": 34.4668,
        "query": "Gaza destruction casualties humanitarian crisis displaced",
        "desc": "One of the oldest cities; subject of military operations and humanitarian siege.",
    },
    {
        "name": "Ramallah",
        "lat": 31.9038,
        "lon": 35.2034,
        "query": "Ramallah West Bank Palestinian Authority",
        "desc": "A major Palestinian cultural and political center in the West Bank.",
    },
    {
        "name": "Hebron (Al-Khalil)",
        "lat": 31.5326,
        "lon": 35.0998,
        "query": "Hebron Al-Khalil settlements occupation",
        "desc": "A historic city known for the Ibrahimi Mosque and traditional crafts.",
    },
    {
        "name": "Nablus",
        "lat": 32.2211,
        "lon": 35.2544,
        "query": "Nablus West Bank raids settlements",
        "desc": "Famous for its traditional soap, knafeh, and historic old city.",
    },
    {
        "name": "Haifa",
        "lat": 32.7940,
        "lon": 34.9896,
        "query": "Haifa Nakba 1948 Palestinian expelled",
        "desc": "A historic coastal city, largely depopulated during the 1948 Nakba.",
    },
    {
        "name": "Jaffa (Yafa)",
        "lat": 32.0504,
        "lon": 34.7522,
        "query": "Jaffa Yafa Nakba 1948 destruction port expelled",
        "desc": "Historically one of Palestine's most important port cities, depopulated in 1948.",
    },
    {
        "name": "Rafah",
        "lat": 31.2956,
        "lon": 34.2527,
        "query": "Rafah crossing humanitarian aid evacuation bombardment",
        "desc": "A border city in southern Gaza; key crossing for humanitarian aid.",
    },
    {
        "name": "Khan Yunis",
        "lat": 31.3436,
        "lon": 34.3061,
        "query": "Khan Yunis destruction bombardment casualties",
        "desc": "One of Gaza's largest cities, heavily affected by military operations.",
    },
    {
        "name": "Jenin",
        "lat": 32.4641,
        "lon": 35.2961,
        "query": "Jenin refugee camp military operation incursion",
        "desc": "Home to one of the West Bank's largest refugee camps.",
    },
]

# Limits applied to each document excerpt shown in a map popup.
_MAX_SNIPPETS = 3
_SNIPPET_CHARS = 280
_SOURCE_LABEL_CHARS = 45


def _get_location_facts(query: str) -> str:
    """Retrieve document excerpts relevant to a location.

    Args:
        query: Search text passed to ``rag_pipeline._retriever.retrieve``.

    Returns:
        Formatted HTML with up to three excerpts (one per distinct
        source/page pair), or ``""`` when no retriever is loaded, nothing is
        retrieved, or retrieval fails.
    """
    retriever = rag_pipeline._retriever
    if retriever is None:
        return ""

    try:
        nodes = retriever.retrieve(query)
        if not nodes:
            return ""

        snippets = []
        seen = set()
        for hit in nodes[:_MAX_SNIPPETS]:
            metadata = hit.node.metadata
            # A missing or None source must not break the label slicing below.
            source = str(metadata.get("source") or "")
            page = metadata.get("page_number", "?")

            key = (source, page)
            if key in seen:
                continue
            seen.add(key)

            text = hit.node.get_content()[:_SNIPPET_CHARS].strip().replace("\n", " ")
            if len(source) > _SOURCE_LABEL_CHARS:
                src_label = source[:_SOURCE_LABEL_CHARS] + "..."
            else:
                src_label = source

            # Document text and file names are external content: escape them
            # so stray markup cannot break (or inject into) the popup HTML.
            # NOTE(review): the original tags/styles were not visible in the
            # reviewed copy; this markup is a minimal reconstruction.
            snippets.append(
                '<div style="margin-bottom:8px;">'
                f'<i>"{html.escape(text)}..."</i><br>'
                f'<small>— {html.escape(src_label)}, p.{html.escape(str(page))}</small>'
                '</div>'
            )
        return "".join(snippets)
    except Exception as e:
        # Best effort: the map must still render without document excerpts.
        print(f"Map Facts Error: {e}")
        return ""


def generate_map():
    """Build the interactive map and save it as ``palestine_map.html``.

    Each entry of ``LOCATIONS`` becomes a red marker whose popup shows the
    location name, its description and, when available, document excerpts
    retrieved for the location's query.

    Returns:
        Path of the saved HTML file.
    """
    m = folium.Map(location=[31.5, 34.8], zoom_start=8, tiles="CartoDB positron")

    for loc in LOCATIONS:
        doc_facts = _get_location_facts(loc["query"])

        # NOTE(review): tags/styles reconstructed; the original markup was
        # not visible in the reviewed copy.
        popup_parts = [
            '<div style="font-family:sans-serif;">',
            f'<h4>{loc["name"]}</h4>',
            f'<p>{loc["desc"]}</p>',
        ]
        if doc_facts:
            popup_parts.extend(
                [
                    "<hr>",
                    "<p><b>📄 From the Documents:</b></p>",
                    doc_facts,
                ]
            )
        popup_parts.append("</div>")
        popup_html = "".join(popup_parts)

        folium.Marker(
            location=[loc["lat"], loc["lon"]],
            popup=folium.Popup(popup_html, max_width=360),
            tooltip=folium.Tooltip(loc["name"], sticky=True),
            icon=folium.Icon(color="red", icon="info-sign"),
        ).add_to(m)

    map_path = "palestine_map.html"
    m.save(map_path)
    return map_path


# ── Timeline Generation ──────────────────────────────────────────────
def generate_timeline():
    """Return the static HTML fragment for the historical timeline."""
    # NOTE(review): only the heading text was visible in the reviewed copy.
    # If the original fragment contained event markup, restore it here.
    timeline_html = """
<div>
<h2>Historical Timeline of the Palestinian Cause</h2>
</div>
"""
    return timeline_html


# ── Word Cloud Generation ──────────────────────────────────────────────
def generate_wordcloud(doc_name="All"):
    """Render a word cloud of the indexed text and save it as ``wordcloud.png``.

    Args:
        doc_name: Value of the ``source`` metadata to restrict the cloud to.
            ``None`` or ``"All"`` uses every node in the docstore.

    Returns:
        Path of the saved image, or ``None`` when no index is loaded.
    """
    if rag_pipeline._index is None:
        return None

    include_all = doc_name is None or doc_name == "All"
    text = " ".join(
        node.get_content()
        for node in rag_pipeline._index.docstore.docs.values()
        if include_all or node.metadata.get("source") == doc_name
    )
    if not text.strip():
        # Fallback if no text
        text = "Palestine History Culture Rights Peace Justice Freedom"

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    img_path = "wordcloud.png"
    fig = plt.figure(figsize=(10, 5))
    try:
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.savefig(img_path, bbox_inches='tight')
    finally:
        # Always release the figure, even if rendering or saving fails.
        plt.close(fig)
    return img_path


# ── Statistics Generation ──────────────────────────────────────────────
def get_statistics():
    """Summarize the indexed chunks.

    Returns:
        A ``(stats_df, df)`` tuple. ``df`` has one row per node with the
        columns ``Source``, ``Page`` and ``Length``; ``stats_df`` aggregates
        it per source into ``Chunks``, ``Total_Length`` and ``Avg_Length``.
        Both are empty DataFrames when no index is loaded or it has no nodes.
    """
    if rag_pipeline._index is None:
        return pd.DataFrame(), pd.DataFrame()

    rows = [
        {
            "Source": node.metadata.get("source", "Unknown"),
            "Page": node.metadata.get("page_number", 0),
            "Length": len(node.get_content()),
        }
        for node in rag_pipeline._index.docstore.docs.values()
    ]
    df = pd.DataFrame(rows)
    if df.empty:
        return pd.DataFrame(), pd.DataFrame()

    stats_df = df.groupby('Source').agg(
        Chunks=('Source', 'count'),
        Total_Length=('Length', 'sum'),
        Avg_Length=('Length', 'mean'),
    ).reset_index()
    return stats_df, df


# ── Advanced Analytics ──────────────────────────────────────────────
# Heuristic "entity" matcher: a capital ASCII letter followed by lowercase
# ASCII letters.
_CAPITALIZED_WORD_RE = re.compile(r'\b[A-Z][a-z]+\b')


def advanced_analytics(text):
    """Return a Markdown report with sentiment and frequent capitalized words.

    Args:
        text: Text to analyze.

    Returns:
        A Markdown string, or a short notice when ``text`` is empty or blank.
    """
    if not text or not text.strip():
        return "No text provided for analysis."

    sentiment = TextBlob(text).sentiment
    sentiment_str = (
        f"Polarity: {sentiment.polarity:.2f} (Negative < 0 < Positive), "
        f"Subjectivity: {sentiment.subjectivity:.2f} (Objective < 0.5 < Subjective)"
    )

    common_entities = Counter(_CAPITALIZED_WORD_RE.findall(text)).most_common(10)

    report_parts = [
        f"### Sentiment Analysis\n{sentiment_str}\n\n",
        "### Frequent Capitalized Entities (Heuristic)\n",
    ]
    report_parts.extend(f"- **{ent}**: {count}\n" for ent, count in common_entities)
    return "".join(report_parts)


# ── Audio Features ──────────────────────────────────────────────
def text_to_speech(text, lang='en'):
    """Synthesize ``text`` with gTTS and save it as ``output_audio.mp3``.

    Returns:
        Path of the saved audio file, or ``None`` if synthesis fails.
    """
    try:
        tts = gTTS(text=text, lang=lang)
        output_path = "output_audio.mp3"
        tts.save(output_path)
        return output_path
    except Exception as e:
        print(f"TTS Error: {e}")
        return None


def speech_to_text(audio_path):
    """Transcribe an audio file with ``Recognizer.recognize_google``.

    Files whose name does not end in ``.wav`` are first converted with
    ffmpeg into a temporary WAV file that is removed afterwards.

    Args:
        audio_path: Path of the recording. Falsy values yield ``""``.

    Returns:
        The recognized text, or a bracketed ``[Voice Input Error: ...]``
        message when conversion or recognition fails.
    """
    if not audio_path:
        return ""

    wav_path = audio_path
    temp_wav = None
    try:
        if not audio_path.lower().endswith(".wav"):
            # A unique temp file avoids collisions between concurrent calls
            # and keeps conversion leftovers out of the working directory.
            fd, temp_wav = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            wav_path = temp_wav
            try:
                subprocess.run(
                    ["ffmpeg", "-y", "-i", audio_path, wav_path],
                    check=True,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                )
            except Exception as e:
                error_msg = f"[Voice Input Error: Audio conversion failed: {str(e)}]"
                print(error_msg)
                return error_msg

        r = sr.Recognizer()
        try:
            with sr.AudioFile(wav_path) as source:
                audio_data = r.record(source)
            text = r.recognize_google(audio_data)
            return text
        except Exception as e:
            error_msg = f"[Voice Input Error: {str(e)}]"
            print(error_msg)
            return error_msg
    finally:
        if temp_wav is not None:
            try:
                os.remove(temp_wav)
            except OSError:
                # Cleanup is best effort; the transcription result stands.
                pass


# ── Translation ──────────────────────────────────────────────
def translate_text(text, target_lang='en'):
    """Translate ``text`` into ``target_lang`` with source auto-detection.

    Returns:
        The translation, or the original ``text`` unchanged when it is empty
        or the translation fails.
    """
    if not text or not text.strip():
        # Nothing to translate; skip the network round trip.
        return text
    try:
        translator = GoogleTranslator(source='auto', target=target_lang)
        return translator.translate(text)
    except Exception as e:
        print(f"Translation Error: {e}")
        return text


# ── Export Chat ──────────────────────────────────────────────
def export_chat_history(history):
    """Write the chat history to ``chat_history.json``.

    Args:
        history: Iterable of messages. Dicts are written as they are; any
            other object is reduced to its ``role`` and ``content``
            attributes (``"unknown"`` / ``""`` when absent).

    Returns:
        Path of the written file, or ``None`` when ``history`` is empty.
    """
    if not history:
        return None

    # Convert objects to dicts if necessary for JSON serialization
    cleaned_history = [
        msg
        if isinstance(msg, dict)
        else {
            "role": getattr(msg, "role", "unknown"),
            "content": getattr(msg, "content", ""),
        }
        for msg in history
    ]

    file_path = "chat_history.json"
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(cleaned_history, f, ensure_ascii=False, indent=4)
    return file_path