- Azure
- Computer Vision API(画像分析 API)
- Speech SDK 1.14.0
- Unity 2019.4.1f1
- MRTK 2.5.1
- Windows 10 PC
- HoloLens2
2.「Speech SDKを設定する(Unity)」から Microsoft.CognitiveServices.Speech.1.14.0.unitypackage をダウンロードし、Unityプロジェクトにインポートします。
4.「スピーカー出力に合成する」を参考に TapToCaptureAnalyzeAPI.cs を編集します。画像分析APIで得られた画像説明文を、Speech SDK の synthesizer.SpeakTextAsync に渡します。
// TapToCaptureAnalyzeAPI.cs
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System;
using UnityEngine;
using Microsoft.MixedReality.Toolkit.Utilities;
using System.Threading.Tasks;
using OpenCVForUnity.CoreModule;
using OpenCVForUnity.UnityUtils;
using OpenCVForUnity.ImgprocModule;
// Speech SDK additions: begin
using System.IO;
using System.Text;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
// Speech SDK additions: end

/// <summary>
/// On an air tap, captures a photo, sends it to the Computer Vision "analyze" endpoint,
/// speaks the first returned caption through <see cref="audioSource"/> using the Speech SDK,
/// and draws the caption onto the photo shown on <see cref="quad"/>.
/// </summary>
public class TapToCaptureAnalyzeAPI : MonoBehaviour
{
    // Speech SDK additions: begin

    /// <summary>Audio source used to play the synthesized speech.</summary>
    public AudioSource audioSource;

    /// <summary>
    /// Synthesizes <paramref name="text"/> with the Speech SDK and plays the resulting
    /// audio through <see cref="audioSource"/>.
    /// </summary>
    /// <param name="text">Text to speak. Null or empty text is ignored.</param>
    /// <returns>A task that completes once playback has been started (or synthesis was canceled).</returns>
    async Task SynthesizeAudioAsync(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return;
        }

        var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");

        // Omitting the null argument plays through the PC's speaker, but nothing is output on HoloLens.
        // https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/quickstart/csharp/unity/text-to-speech/Assets/Scripts/HelloWorld.cs
        using (var synthesizer = new SpeechSynthesizer(config, null))
        // Starts speech synthesis and resumes after a single utterance is synthesized.
        // Awaiting (instead of blocking on .Result) keeps the calling thread free while the service responds.
        using (var result = await synthesizer.SpeakTextAsync(text))
        {
            if (result.Reason == ResultReason.SynthesizingAudioCompleted)
            {
                // Native playback is not supported on Unity yet (currently only supported on Windows/Linux Desktop).
                // Use the Unity API to play audio here as a short term solution.
                byte[] pcmBytes = result.AudioData;
                int sampleCount = pcmBytes == null ? 0 : pcmBytes.Length / 2;
                if (sampleCount == 0)
                {
                    Debug.LogWarning("Speech synthesis completed but returned no audio data.");
                    return;
                }

                if (audioSource == null)
                {
                    Debug.LogError("audioSource is not assigned; cannot play synthesized speech.");
                    return;
                }

                // Convert little-endian 16-bit PCM samples to floats in [-1, 1).
                var audioData = new float[sampleCount];
                for (var i = 0; i < sampleCount; ++i)
                {
                    audioData[i] = (short)(pcmBytes[i * 2 + 1] << 8 | pcmBytes[i * 2]) / 32768.0F;
                }

                // The output audio format is 16K 16bit mono
                var audioClip = AudioClip.Create("SynthesizedAudio", sampleCount, 1, 16000, false);
                audioClip.SetData(audioData, 0);
                audioSource.clip = audioClip;
                audioSource.Play();
            }
            else if (result.Reason == ResultReason.Canceled)
            {
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(result);
                Debug.LogError($"CANCELED:\nReason=[{cancellation.Reason}]\nErrorDetails=[{cancellation.ErrorDetails}]\nDid you update the subscription info?");
            }
        }
    }

    // Speech SDK additions: end

    /// <summary>Quad whose material receives the captioned photo.</summary>
    public GameObject quad;

    /// <summary>Root object deserialized from the analyze response JSON.</summary>
    [System.Serializable]
    public class Analyze
    {
        public Categories[] categories;
        public Color color;
        public Description description;
        public string requestId;
        public Metadata metadata;
    }

    [System.Serializable]
    public class Categories
    {
        public string name;
        public float score;
    }

    [System.Serializable]
    public class Color
    {
        public string dominantColorForeground;
        public string dominantColorBackground;
        public string[] dominantColors;
        public string accentColor;
        // Both spellings are kept so that existing users of either field keep compiling.
        public bool isBwImg;
        public bool isBWImg;
    }

    [System.Serializable]
    public class Description
    {
        public string[] tags;
        public Captions[] captions;
    }

    [System.Serializable]
    public class Captions
    {
        public string text;
        public float confidence;
    }

    [System.Serializable]
    public class Metadata
    {
        public int height;
        public int width;
        public string format;
    }

    UnityEngine.Windows.WebCam.PhotoCapture photoCaptureObject = null;
    Texture2D targetTexture = null;

    // Texture most recently assigned to the quad; destroyed when it is replaced.
    Texture2D annotatedTexture = null;

    // NOTE(review): the host part of the endpoint is empty here — fill in your Computer Vision resource host.
    private string endpoint = "https:///vision/v3.1/analyze";
    private string subscription_key = "";

    // True from the moment a capture is requested until photo mode has been stopped.
    private bool waitingForCapture;

    void Start()
    {
        waitingForCapture = false;
    }

    /// <summary>
    /// Starts one capture-and-analyze cycle. Calls made while a cycle is in progress are ignored.
    /// </summary>
    public void AirTap()
    {
        if (waitingForCapture)
        {
            return;
        }

        var supportedResolutions = UnityEngine.Windows.WebCam.PhotoCapture.SupportedResolutions;
        if (supportedResolutions == null || !supportedResolutions.Any())
        {
            Debug.LogError("No supported photo capture resolution was reported.");
            return;
        }

        waitingForCapture = true;

        // Use the largest supported resolution.
        Resolution cameraResolution =
            supportedResolutions.OrderByDescending((res) => res.width * res.height).First();

        // Release the texture from the previous capture before allocating a new one.
        if (targetTexture != null)
        {
            Destroy(targetTexture);
        }
        targetTexture = new Texture2D(cameraResolution.width, cameraResolution.height);

        // Create the PhotoCapture object.
        UnityEngine.Windows.WebCam.PhotoCapture.CreateAsync(false, delegate (UnityEngine.Windows.WebCam.PhotoCapture captureObject)
        {
            if (captureObject == null)
            {
                Debug.LogError("Failed to create the PhotoCapture object.");
                waitingForCapture = false;
                return;
            }

            photoCaptureObject = captureObject;

            UnityEngine.Windows.WebCam.CameraParameters cameraParameters = new UnityEngine.Windows.WebCam.CameraParameters();
            cameraParameters.hologramOpacity = 0.0f;
            cameraParameters.cameraResolutionWidth = cameraResolution.width;
            cameraParameters.cameraResolutionHeight = cameraResolution.height;
            cameraParameters.pixelFormat = UnityEngine.Windows.WebCam.CapturePixelFormat.BGRA32;

            // Activate the camera.
            photoCaptureObject.StartPhotoModeAsync(cameraParameters, delegate (UnityEngine.Windows.WebCam.PhotoCapture.PhotoCaptureResult result)
            {
                if (!result.success)
                {
                    Debug.LogError("Failed to start photo mode.");
                    photoCaptureObject.Dispose();
                    photoCaptureObject = null;
                    waitingForCapture = false;
                    return;
                }

                // Take the photo.
                photoCaptureObject.TakePhotoAsync(OnCapturedPhotoToMemoryAsync);
            });
        });
    }

    /// <summary>
    /// Photo callback: uploads the frame to the analyze endpoint, speaks the first caption,
    /// draws it on the image, and finally stops photo mode on every path.
    /// </summary>
    /// <param name="result">Outcome of the photo capture.</param>
    /// <param name="photoCaptureFrame">Captured frame whose image data is copied into the target texture.</param>
    async void OnCapturedPhotoToMemoryAsync(UnityEngine.Windows.WebCam.PhotoCapture.PhotoCaptureResult result, UnityEngine.Windows.WebCam.PhotoCaptureFrame photoCaptureFrame)
    {
        // async void is required by the callback signature, so every exception is handled here.
        try
        {
            if (!result.success)
            {
                Debug.LogError("Photo capture failed.");
                return;
            }

            // Copy the raw image data into the target texture.
            photoCaptureFrame.UploadImageDataToTexture(targetTexture);
            byte[] bodyData = targetTexture.EncodeToJPG();

            string query = endpoint + "?visualFeatures=Categories,Description,Color";
            var headers = new Dictionary<string, string>();
            headers.Add("Ocp-Apim-Subscription-Key", subscription_key);

            Response response = await Rest.PostAsync(query, bodyData, headers, -1, true);
            if (!response.Successful)
            {
                Debug.LogError("Image analysis request failed: " + response.ResponseCode + " " + response.ResponseBody);
                return;
            }

            Debug.Log(response.ResponseCode);
            Debug.Log(response.ResponseBody);

            Analyze analyze = JsonUtility.FromJson<Analyze>(response.ResponseBody);
            string caption = FirstCaptionText(analyze);
            if (string.IsNullOrEmpty(caption))
            {
                Debug.LogWarning("The analysis result contained no caption.");
                return;
            }
            Debug.Log(caption);

            // Speech SDK additions: begin
            // Pass the generated image caption to SynthesizeAudioAsync.
            await SynthesizeAudioAsync(caption);
            // Speech SDK additions: end

            DrawCaptionOnQuad(caption);
        }
        catch (Exception e)
        {
            Debug.LogException(e);
        }
        finally
        {
            // Deactivate the camera.
            if (photoCaptureObject != null)
            {
                photoCaptureObject.StopPhotoModeAsync(OnStoppedPhotoMode);
            }
            else
            {
                waitingForCapture = false;
            }
        }
    }

    /// <summary>Returns the text of the first caption, or null when the result has none.</summary>
    static string FirstCaptionText(Analyze analyze)
    {
        if (analyze == null || analyze.description == null)
        {
            return null;
        }

        Captions[] captions = analyze.description.captions;
        if (captions == null || captions.Length == 0 || captions[0] == null)
        {
            return null;
        }

        return captions[0].text;
    }

    /// <summary>Writes the caption onto the captured image with OpenCV and shows it on the quad.</summary>
    void DrawCaptionOnQuad(string caption)
    {
        using (Mat imgMat = new Mat(targetTexture.height, targetTexture.width, CvType.CV_8UC4))
        {
            Utils.texture2DToMat(targetTexture, imgMat);
            Debug.Log("imgMat.ToString() " + imgMat.ToString());

            Imgproc.putText(imgMat, caption, new Point(10, 100), Imgproc.FONT_HERSHEY_SIMPLEX, 4.0, new Scalar(255, 255, 0, 255), 4, Imgproc.LINE_AA, false);

            Texture2D texture = new Texture2D(imgMat.cols(), imgMat.rows(), TextureFormat.RGBA32, false);
            Utils.matToTexture2D(imgMat, texture);

            Renderer quadRenderer = quad.GetComponent<Renderer>();
            quadRenderer.material.SetTexture("_MainTex", texture);

            // The previously displayed texture is no longer referenced by the material.
            if (annotatedTexture != null)
            {
                Destroy(annotatedTexture);
            }
            annotatedTexture = texture;
        }
    }

    /// <summary>Releases the photo capture resources and allows the next air tap.</summary>
    void OnStoppedPhotoMode(UnityEngine.Windows.WebCam.PhotoCapture.PhotoCaptureResult result)
    {
        // Shut down the photo capture resource.
        if (photoCaptureObject != null)
        {
            photoCaptureObject.Dispose();
            photoCaptureObject = null;
        }
        waitingForCapture = false;
    }
}
/// <summary>
/// Speaks a fixed sentence using a synthesizer built from the subscription
/// configuration alone (no audio configuration argument is passed).
/// </summary>
static async Task SynthesizeAudioAsync()
{
    var speechConfig = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");

    // The synthesizer is disposed when the block exits, after the awaited call finishes.
    using (var speaker = new SpeechSynthesizer(speechConfig))
    {
        await speaker.SpeakTextAsync("Synthesizing directly to speaker output.");
    }
}
7.そこで、InternetClientServer と PrivateNetworkClientServer の機能を有効にしたり、MixedRealityToolkitの音声コマンドの設定をしてみたり、
音声認識と文字起こしには Speech SDK を使用するため、Speech SDK の機能を妨げないように MRTK の音声コマンドを構成する必要があります。そのためには、音声コマンドの開始動作(Start Behavior)を Auto Start から Manual Start に変更します。
Hierarchy ウィンドウで MixedRealityToolkit オブジェクトを選択した状態で、Inspector ウィンドウで Input タブを選択し、DefaultHoloLens2InputSystemProfile と DefaultMixedRealitySpeechCommandsProfile を複製し、音声コマンドの Start Behavior を Manual Start に変更します。
8.調べた結果、「【Unity】Microsoft Azure を用いてキャラクターを流暢に話させる」という記事を見つけました。
9.あとはTapToCaptureAnalyzeにAudioSourceをAdd Componentし、TapToCaptureAnalyzeAPIのAudioSourceにTapToCaptureAnalyzeをD&Dしてアタッチします。
“a hand holding a fanned out money” 札束を持つ手