import { MutableRefObject, useEffect, useRef, useState } from "react";
import { logger } from "../../../utils/logger";
import {
  getSpeechConfig,
  initTokenRefresher,
} from "../../../services/speech/speech-config";
import { useSettings } from "../../../hooks/use-settings";
import useStateRef from "react-usestateref";
import { useSimpleToast } from "../../../hooks/use-simple-toast";
import {
  CancellationReason,
  ResultReason,
  SpeechRecognizer,
} from "microsoft-cognitiveservices-speech-sdk";
import { azureSpeechToText } from "../../../services/speech/azure-speech-to-text";
import { captureException } from "@sentry/nextjs";
import { AzRecognitionResult } from "../../../server/speech/azure/transcribe";
import { getAzureConfidence } from "../../../utils/get-azure-confidence";
import { type MicVAD } from "@ricky0123/vad-web";
import { useAudioMonitor } from "./use-audio-pulse";

/**
 * Result of a finished recording session.
 *
 * `audioRecording` is the blob assembled from the recorded audio chunks.
 * `transcript` and `confidence` are only filled in when the "azure" provider
 * reported the end of speech before the fallback timeout; otherwise only the
 * audio is returned.
 */
export type RecognitionRecording = {
  audioRecording: Blob;
  transcript?: string;
  confidence?: number;
};
/** Not referenced in this file; presumably consumed by callers of the hook. */
export type SpeechRecognitionState = "active" | "inactive";
/** Not referenced in this file; presumably consumed by callers of the hook. */
export type ReadAloudState = "active" | "inactive";

export function useSpeechToText(
  ttsAudioElementRef: MutableRefObject<HTMLAudioElement | undefined>
) {
  const { toastFail } = useSimpleToast();
  const { settings } = useSettings();
  // True between a successful startSpeechRecognition and the clean-up done by
  // stopSpeechRecognition.
  const [isRecording, setIsRecording] = useState(false);

  // Audio recording
  // Microphone stream obtained from getUserMedia; cleared by resetMediaRecorder.
  const [recordingMediaStream, setRecordingMediaStream] =
    useState<MediaStream>();
  // hasPulseRef is read when stopping: a recording without a pulse is rejected.
  // NOTE(review): the exact meaning of audioPulse/hasPulse is defined by
  // useAudioMonitor, which is not visible here.
  const { audioPulse, hasPulse, hasPulseRef } = useAudioMonitor(
    recordingMediaStream,
    isRecording
  );
  // Updated from the voice-activity-detection callbacks; "failed-to-load" is
  // set when the global VAD script is not available.
  const [vadStatus, setVadStatus] = useState<
    "idle" | "speech-detected" | "speech-ended" | "misfire" | "failed-to-load"
  >("idle");

  // VAD instance created in startSpeechRecognition and destroyed on stop.
  const activeVad = useRef<MicVAD>();
  // True while getSpeechConfig() is in flight, so the bootstrap effect does
  // not issue a second request.
  const loadingSpeechConfig = useRef(false);

  // For the next two hooks only the setter and the ref are used; the state
  // values themselves (`_`, `__`) are intentionally ignored.
  const [_, setSpeechAudioChunks, speechAudioChunksRef] =
    useStateRef<BlobPart[]>();
  const [__, setSpeechAudioRecorder, speechAudioRecorderRef] =
    useStateRef<MediaRecorder>();

  // Real time speech recognition
  // Set once getSpeechConfig() has resolved.
  const [speechConfigReady, setSpeechConfigReady] = useState(false);
  // Recognizer returned by azureSpeechToText; undefined when none is active.
  const speechRecognizer = useRef<SpeechRecognizer>();
  // Text of all `recognized` results so far, joined with spaces.
  const [completeSpeech, setCompleteSpeech, completeSpeechRef] =
    useStateRef("");
  // Text of the latest `recognizing` (interim) result; emptied when a
  // `recognized` result arrives.
  const [incompleteSpeech, setIncompleteSpeech, incompleteSpeechRef] =
    useStateRef("");
  // Parsed JSON payload of each `recognized` result, later passed to
  // getAzureConfidence.
  const [detailedResults, setDetailedResults, detailedResultsRef] = useStateRef<
    AzRecognitionResult[]
  >([]);

  // Bootstraps the speech configuration: starts the token refresher and
  // requests the config, unless it is already available or being requested.
  useEffect(() => {
    if (!speechConfigReady && !loadingSpeechConfig.current) {
      // Mark the request as in flight before any asynchronous work begins.
      loadingSpeechConfig.current = true;
      initTokenRefresher();

      const releaseLoadingFlag = () => {
        loadingSpeechConfig.current = false;
      };

      getSpeechConfig()
        .then(() => setSpeechConfigReady(true))
        .catch((err) => {
          logger.error("getSpeechConfig failed", err);
          toastFail("Unable to contact the voice recognition server");
        })
        .finally(releaseLoadingFlag);
    }
  }, [speechConfigReady, toastFail]);

  /**
   * Starts a recording session.
   *
   * Acquires the microphone, records it with a `MediaRecorder`, attaches
   * voice-activity detection when the global `vad` script is present and, for
   * the "azure" provider, wires the real-time recognizer callbacks that feed
   * the transcript state.
   *
   * Errors from `getUserMedia` or `azureSpeechToText` are not caught here and
   * reject the returned promise.
   */
  const startSpeechRecognition = async () => {
    // NOTE(review): presumably created here so the audio element exists before
    // text-to-speech playback needs it — confirm with the caller.
    if (!ttsAudioElementRef.current) {
      ttsAudioElementRef.current = new Audio();
    }

    const stream = await window.navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1,
        echoCancellation: true,
        autoGainControl: true,
        noiseSuppression: true,
      },
    });

    // Clear transcript state from a previous session, but keep any recognizer
    // untouched (unmount = false).
    resetRecognitions(false);
    setRecordingMediaStream(stream);
    setIsRecording(true);

    logger.info(
      `Using ${settings.speechRecognitionProvider} for speech recognition`
    );

    const mimeType = getSupportedMimeType();
    logger.info(`Using ${mimeType} for audio recording`);
    const recorder = new MediaRecorder(stream, {
      mimeType,
      audioBitsPerSecond: 16000,
    });

    const audioChunks: BlobPart[] = [];

    recorder.ondataavailable = (event) => {
      audioChunks.push(event.data);
      setSpeechAudioChunks(audioChunks);
    };

    // https://wiki.vad.ricky0123.com/
    // The VAD library is expected as a global; it is optional.
    const MicVad = (window as Window & { vad?: { MicVAD?: typeof MicVAD } })
      .vad?.MicVAD;

    if (MicVad) {
      MicVad.new({
        positiveSpeechThreshold: 0.9,
        stream: stream,
        onSpeechStart: () => {
          setVadStatus("speech-detected");
          logger.success("User started talking");
        },
        onVADMisfire: () => {
          logger.verbose("Vad misfire");
          setVadStatus("misfire");
        },
        onSpeechEnd: () => {
          setVadStatus("speech-ended");
          // If we wanted intermediate results we could use this callback to send audio
          // to the speech recognition service on the server
          logger.info("User stopped talking");
        },
      })
        .then((vad: MicVAD) => {
          // The session may already have been stopped while the VAD was
          // loading; starting it on an ended stream would leave it running
          // with nothing left to destroy it.
          if (!stream.active) {
            vad.destroy();
            return;
          }
          activeVad.current = vad;
          vad.start();
        })
        .catch((err) => {
          setVadStatus("failed-to-load");
          logger.error("VAD failed to load", err);
        });
    } else {
      setVadStatus("failed-to-load");
      logger.error("VAD failed to load");
    }

    // The VAD is started from the promise above once it has loaded; at this
    // point `activeVad.current` could only be a stale instance from an earlier
    // session, so it is deliberately not touched here.
    recorder.start();
    setSpeechAudioRecorder(recorder);

    if (settings.speechRecognitionProvider !== "azure") {
      return;
    }

    const recognizer = await azureSpeechToText(settings);
    speechRecognizer.current = recognizer;

    // Interim hypothesis for the phrase currently being spoken.
    recognizer.recognizing = (_sender, e) => {
      setIncompleteSpeech(e.result.text);
    };

    // Final result for a phrase: append it to the running transcript.
    recognizer.recognized = (_sender, e) => {
      if (!speechRecognizer.current) {
        logger.error(
          "Recognized event triggered but speechRecognizer is undefined"
        );
        resetRecognitions();
        return;
      }

      const recognizedText = e.result.text.trim();
      if (e.result.reason === ResultReason.RecognizedSpeech && recognizedText) {
        const updatedTranscript =
          completeSpeechRef.current + " " + recognizedText;

        try {
          const detailedResult = JSON.parse(
            e.result.json
          ) as AzRecognitionResult;
          setDetailedResults([...detailedResultsRef.current, detailedResult]);
        } catch (error: unknown) {
          // The detailed payload only feeds the confidence score; the
          // transcript is still usable without it.
          const message =
            error instanceof Error ? error.message : String(error);
          logger.warn("Detailed results error:" + message);
        }

        logger.info(`Recognized: ${updatedTranscript}`);
        setIncompleteSpeech("");
        setCompleteSpeech(updatedTranscript);
      }
    };
    recognizer.sessionStopped = () => {};

    recognizer.canceled = (s, e) => {
      console.log("MUST NOT HAPPEN", s, e);
      if (e.reason === CancellationReason.Error) {
        logger.error(
          `"MUST NOT HAPPEN. CANCELED: ErrorDetails=${e.errorDetails}`
        );
      } else {
        logger.error(`"MUST NOT HAPPEN. CANCELED: Reason=${e.reason}`);
      }

      // stopSpeechRecognition rejects on several paths (no recorder, no audio
      // detected, ...); nobody awaits it here, so handle the rejection to
      // avoid an unhandled promise rejection.
      stopSpeechRecognition().catch((err) => {
        logger.error("stopSpeechRecognition failed after cancellation", err);
      });
    };
  };

  /**
   * Stops the current recording session and resolves with its result.
   *
   * - Non-azure providers resolve with the recorded audio only.
   * - For "azure", recognition is stopped once no interim text is pending and
   *   the promise resolves with audio, transcript and confidence when the end
   *   of speech is reported; if that does not happen within 5 s it resolves
   *   with the audio only.
   *
   * Rejects with a string reason when no recorder exists, when no audio pulse
   * was detected, or when the azure recognizer is missing. The reasons are
   * plain strings (not `Error`s) because callers may compare against them.
   */
  const stopSpeechRecognition = (): Promise<RecognitionRecording> => {
    return new Promise((resolve, reject) => {
      const recorder = speechAudioRecorderRef.current;
      if (!recorder) {
        reject("no audio recorder initialized");
        return;
      }

      // Builds the audio blob, releases the capture resources and settles the
      // promise. Must run at most once per call.
      const finalizeRecording = () => {
        const mimeType = getSupportedMimeType();
        const audioBlob = new Blob(speechAudioChunksRef.current, {
          type: mimeType,
        });

        // Destroy the VAD while its stream is still active: stopping the
        // tracks first could make the `active` check below fail and leave the
        // VAD instance alive.
        if (activeVad.current && activeVad.current.options.stream?.active) {
          activeVad.current.destroy();
          activeVad.current = undefined;
          setVadStatus("idle");
        }

        resetMediaRecorder();
        setIsRecording(false);

        if (!hasPulseRef.current) {
          // Drop the reference so a closed recognizer is never reused or
          // closed a second time.
          speechRecognizer.current?.close();
          speechRecognizer.current = undefined;
          reject("no audio detected in stopSpeechRecognition");
          return;
        }

        logger.verbose(`Blob size: ${audioBlob.size} bytes`);

        if (settings.speechRecognitionProvider !== "azure") {
          logger.info("Submit audio blob");
          resolve({ audioRecording: audioBlob });
          return;
        }

        if (!speechRecognizer.current) {
          reject("no speech recognizer initialized");
          return;
        }

        // Fallback: if the recognizer never reports the end of speech, submit
        // the audio without a transcript.
        const speechEndTimer = setTimeout(() => {
          logger.error(
            "Speech end not detected. Fallback: submitting audio blob"
          );
          captureException(
            "Speech end not detected. Fallback: submitting audio blob"
          );
          resetRecognitions();
          resolve({ audioRecording: audioBlob });
        }, 5000);

        speechRecognizer.current.speechEndDetected = () => {
          logger.info("Speech end detected");

          clearTimeout(speechEndTimer);

          let transcript = completeSpeechRef.current.trim();

          if (settings.submitWithTriggerWord) {
            const triggerWord =
              settings.triggerWord?.toLocaleLowerCase() ?? "";
            // Escape the trigger word so characters such as "." or "+" are
            // matched literally instead of being read as regex syntax.
            const escapedTriggerWord = triggerWord.replace(
              /[.*+?^${}()|[\]\\]/g,
              "\\$&"
            );
            const regex = new RegExp(`${escapedTriggerWord}$`, "i"); // Case-insensitive match at the end of the string
            transcript = transcript.replace(regex, "");
          }

          logger.info(`Final transcript: ${transcript}`);

          const confidence = getAzureConfidence(detailedResultsRef.current);

          resetRecognitions();

          resolve({
            audioRecording: audioBlob,
            transcript,
            confidence,
          });
        };

        // Wait until the interim text has been turned into a final result
        // before asking the recognizer to stop, polling every 500 ms.
        const stopWhenReady = () => {
          const noPendingSpeech = incompleteSpeechRef.current?.length === 0;
          logger.verbose(`Speech is complete: ${noPendingSpeech}`);

          if (noPendingSpeech) {
            speechRecognizer.current?.stopContinuousRecognitionAsync();
          } else {
            setTimeout(stopWhenReady, 500);
          }
        };

        stopWhenReady();
      };

      // Calling .onstop on inactive recorder will cause cryptic crashes, so we need to check
      if (recorder.state === "inactive") {
        finalizeRecording();
        return;
      }

      // Finalize on the recorder's `stop` event, or after 3 s if that event
      // never arrives — whichever happens first, and only once.
      let finalized = false;
      const finalizeOnce = () => {
        if (finalized) {
          return;
        }
        finalized = true;
        clearTimeout(recorderStopTimer);
        finalizeRecording();
      };

      const recorderStopTimer = setTimeout(finalizeOnce, 3000);
      recorder.onstop = finalizeOnce;
      recorder.stop();
    });
  };

  // Stops every track of the current microphone stream (if there is one) and
  // clears the stream from state.
  const resetMediaRecorder = () => {
    setRecordingMediaStream((currentStream) => {
      // The updater form gives access to the stream currently held in state.
      currentStream?.getTracks().forEach((track) => {
        track.stop();
      });

      return undefined;
    });
  };

  /**
   * Clears the transcript, interim text, audio chunks and detailed results.
   *
   * @param unmount when true (default) an existing speech recognizer is also
   *   closed and its reference dropped.
   */
  const resetRecognitions = (unmount = true) => {
    logger.success("Clean up: resetting recognitions");
    setCompleteSpeech("");
    setIncompleteSpeech("");
    setSpeechAudioChunks([]);
    setDetailedResults([]);

    const recognizer = speechRecognizer.current;
    if (!unmount || !recognizer) {
      return;
    }

    logger.verbose("Clean up: close and unmount speech recognizer");
    recognizer.close();
    speechRecognizer.current = undefined;
  };

  // Public surface of the hook: recording state, the audio monitor values,
  // the start/stop/reset controls and the VAD status.
  return {
    isRecording,
    recordingAudioPulse: audioPulse,
    startSpeechRecognition,
    stopSpeechRecognition,
    resetRecognitions,
    hasPulse,
    vadStatus,
    // The live transcript is only exposed for the "azure" provider; for any
    // other provider it is undefined.
    transcript:
      settings.speechRecognitionProvider === "azure"
        ? completeSpeech
        : undefined,
  };
}

/**
 * Picks the audio MIME type used for recording: the first candidate, in order
 * of preference, that `MediaRecorder.isTypeSupported` accepts. When none is
 * accepted the last candidate (MP4/AAC) is returned anyway.
 */
function getSupportedMimeType() {
  const fallbackType = "audio/mp4;codecs=mp4a.40.2";
  const candidates = [
    "audio/webm;codecs=opus",
    "audio/ogg;codecs=opus",
    "audio/webm",
    fallbackType,
  ];

  const firstSupported = candidates.find((candidate) =>
    MediaRecorder.isTypeSupported(candidate)
  );
  return firstSupported ?? fallbackType;
}
