"""Gradio app: transcribe Urdu audio with Whisper, then use a HugChat-backed
LLM to correct/translate the text and emit crop/disease JSON records."""

import json
import logging
import os

import gradio as gr
import librosa
import torch
from hugchat import hugchat
from hugchat.login import Login
from transformers import pipeline

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model and device configuration for transcription
MODEL_NAME = "openai/whisper-large-v3-turbo"
device = 0 if torch.cuda.is_available() else "cpu"

# Initialize Whisper pipeline once at import time (model load is expensive).
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Hugging Face Chatbot credentials MUST come from environment variables.
# SECURITY: the previous revision shipped a real e-mail/password pair as
# hard-coded fallbacks — never commit credentials to source control. If the
# variables are unset, chatbot init fails gracefully below and the app runs
# in transcription-only mode.
EMAIL = os.getenv("EMAIL")
PASSWD = os.getenv("PASSWD")

# Directory where hugchat persists its login cookies.
cookie_path_dir = "./cookies/"
os.makedirs(cookie_path_dir, exist_ok=True)

# Initialize chatbot with error handling; `chatbot is None` signals
# "unavailable" to process_audio(), which then returns transcription-only JSON.
chatbot = None
try:
    if not (EMAIL and PASSWD):
        raise ValueError("EMAIL and PASSWD environment variables are required")
    sign = Login(EMAIL, PASSWD)
    cookies = sign.login(cookie_dir_path=cookie_path_dir, save_cookies=True)
    chatbot = hugchat.ChatBot(cookies=cookies.get_dict())
    logger.info("Chatbot initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize chatbot: {e}")


def transcribe_audio(audio_path):
    """Transcribe a local audio file using the Whisper pipeline.

    Returns the Urdu transcription text on success, or a string starting
    with "Error processing audio:" on failure (callers key off that prefix).
    """
    try:
        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio file not found")
        # Whisper expects 16 kHz mono input.
        audio, sr = librosa.load(audio_path, sr=16000, mono=True)
        transcription = pipe(
            audio, batch_size=8, generate_kwargs={"language": "urdu"}
        )["text"]
        return transcription
    except Exception as e:
        return f"Error processing audio: {str(e)}"


def extract_info_from_filename(filename):
    """Extract (agent, file_number, city, country) from the filename.

    Expected format: ``agentX_N_City_Country[.ext]``. Returns a tuple of
    four ``None`` values when the filename does not match.
    """
    try:
        parts = filename.split('_')
        if len(parts) < 4:
            raise ValueError("Filename must have at least 4 parts: agentX_N_City_Country")
        agent = parts[0]
        file_number = int(parts[1])
        city = parts[2]
        # Remove the file extension if present (splitext strips exactly the
        # trailing extension, unlike split('.') which truncates at the first dot).
        country = os.path.splitext(parts[3])[0]
        return agent, file_number, city, country
    except (ValueError, IndexError):
        return None, None, None, None


def _fallback_json(agent, file_number, city, country, transcription, error=None):
    """Build a transcription-only JSON record via json.dumps.

    The previous f-string assembly produced invalid JSON whenever the
    transcription contained quotes, backslashes, or newlines.
    """
    record = {
        "Recording_name": "(unknown)",
        "agent": agent,
        "file_number": file_number,
        "city": city,
        "country": country,
        "transcription": transcription,
    }
    if error is not None:
        record["error"] = error
    return json.dumps({"records": [record]}, ensure_ascii=False)


def process_audio(audio_path):
    """Process audio: extract info from filename, transcribe, and generate JSON.

    Returns a 3-tuple ``(json_string, transcription, filename)`` matching the
    three Gradio output components.
    """
    if not audio_path:
        return json.dumps({"error": "No audio file provided"}), "", ""

    # Extract filename and info
    filename = os.path.basename(audio_path)
    agent, file_number, city, country = extract_info_from_filename(filename)
    if agent is None:
        return (
            json.dumps(
                {"error": "Invalid filename format. Use format: agentX_N_City_Country.wav"}
            ),
            "",
            filename,
        )

    # Transcribe audio; failures come back as an "Error processing audio:" prefix,
    # so match the prefix rather than searching the whole (Urdu) text for "Error".
    transcription = transcribe_audio(audio_path)
    if transcription.startswith("Error"):
        return json.dumps({"error": transcription}, ensure_ascii=False), transcription, filename

    # Fallback JSON if chatbot is not initialized
    if chatbot is None:
        logger.warning("Chatbot unavailable, returning transcription-only JSON")
        return (
            _fallback_json(agent, file_number, city, country, transcription),
            transcription,
            filename,
        )

    # Construct prompt with extracted data
    prompt = f"""
Correct the given Urdu text for grammar, word accuracy, and contextual meaning without adding anything extra. Then, translate the corrected text into English. Next, create a JSON file that detects crops and their diseases, following this format:
{{
    "records": [
        {{
            "Recording_name": "(unknown)",
            "agent": "{agent}",
            "file_number": {file_number},
            "city": "{city}",
            "country": "{country}",
            "crops": [
                {{
                    "name": "",
                    "season": "",
                    "harvest_months": [""],
                    "regions": [""],
                    "diseases": [
                        {{
                            "name": "",
                            "description": "",
                            "wikipedia_link": ""
                        }}
                    ]
                }}
            ],
            "issues": [""],
            "disease_linking": {{
                "": [""]
            }}
        }}
    ]
}}
The Urdu text to process is: {transcription}
Only provide the JSON output, do not include any additional text.
"""

    # Process with chatbot and return JSON
    try:
        response = chatbot.chat(prompt).wait_until_done()
        return response, transcription, filename
    except Exception as e:
        logger.error(f"Chatbot processing failed: {e}")
        return (
            _fallback_json(
                agent,
                file_number,
                city,
                country,
                transcription,
                error=f"Chatbot processing failed: {str(e)}",
            ),
            transcription,
            filename,
        )


# Gradio Interface
with gr.Blocks(title="Audio Transcription and Crop Analysis") as interface:
    gr.Markdown("## Audio Transcription and Crop Disease Analysis")
    with gr.Row():
        audio_input = gr.Audio(
            type="filepath",
            label="Upload Audio File (e.g., agent1_2_Multan_Pakistan.wav)",
        )
    with gr.Row():
        json_output = gr.Textbox(label="JSON Output", interactive=False, lines=10)
        transcription_output = gr.Textbox(
            label="Transcription (Urdu)", interactive=False, lines=5
        )
    filename_output = gr.Textbox(label="Processed Filename", interactive=False)
    process_button = gr.Button("Process Audio")
    process_button.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[json_output, transcription_output, filename_output],
    )

if __name__ == "__main__":
    interface.launch(server_name="0.0.0.0", server_port=7860)