HoloLens2 × Azure Cognitive Services（Speech SDKで音声認識）

投稿日 2021年1月19日
著者 azure-recipe-user
カテゴリー Azure
カテゴリー Cognitive Services
カテゴリー Microsoft HoloLens

はじめに

HoloLensアドベントカレンダー2020の10日目の記事です。
前回の続きで、エアタップして目の前の画像をキャプチャし、説明文を生成、日本語で読み上げているのですが、音声認識によってこれを動作させたいと思います。「ヨンシル、文字を読んで」「ヨンシル、何が見える？」

開発環境

Azure
- Computer Vision API (画像分析 API)
- Translator API
- Speech SDK 1.14.0
Unity 2019.4.1f1
MRTK 2.5.1
Windows 10 PC
HoloLens2

導入

1．前回の記事まで終わらせてください。

2．Unityプロジェクトはこんな感じ。エアタップはもう使わないので、前回の「TapToCaptureAnalyze」を非アクティブにしてください。代わりにMySpeechRecognizerを作成します。

3．MySpeechRecognizerにAudioSourceをAdd Componentします。

4．MySpeechRecognizerにTapToCaptureAnalyzeAPI.csをAdd Componentし、Audio SourceにMySpeechRecognizerをアタッチします。あと画像分析結果の画像となるQuadもアタッチしてください。

5．「MySpeechRecognizer.cs」スクリプトは、エアタップの代わりに音声認識してアクションするプログラムです。プログラムがスタートしたら音声認識を継続的に行います。まずWakeワードを認識し、「はい」と応答、その後Actionワードを認識するとTapToCaptureAnalyzeAPIのAirTap関数を実行します。

MySpeechRecognizer.cs
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Microsoft.CognitiveServices.Speech;

/// <summary>
/// Runs continuous Azure Speech SDK recognition and turns it into a two-step voice command:
/// once <see cref="WakeWord"/> is heard the component answers "はい", and the next result that
/// contains <see cref="ActionWord"/> calls <c>AirTap()</c> on the TapToCaptureAnalyzeAPI
/// component attached to the same GameObject.
/// </summary>
public class MySpeechRecognizer : MonoBehaviour
{
    // Most recent recognition text that Update() has not consumed yet.
    private string recognizedString = "";

    // Guards recognizedString: it is written by the recognizer's event handlers and read by Update().
    // NOTE(review): the SDK presumably raises those events off the Unity main thread - confirm.
    private readonly object threadLocker = new object();

    private SpeechRecognizer recognizer;
    private string fromLanguage = "ja-JP";

    // Word that arms the component (set in the Inspector).
    public string WakeWord = "";
    // Word that triggers AirTap() once the component is armed (set in the Inspector).
    public string ActionWord = "";
    // True after the wake word has been acknowledged and until the action word is handled.
    public bool action = false;

    // Cached sibling component that performs the capture/analysis and the speech synthesis.
    private TapToCaptureAnalyzeAPI analyzer;

    // True while the "はい" reply is being synthesized, so Update() does not start a second reply.
    private bool responding = false;

    // Start is called before the first frame update
    void Start()
    {
        analyzer = GetComponent<TapToCaptureAnalyzeAPI>();
        if (analyzer == null)
        {
            Debug.LogError("MySpeechRecognizer requires a TapToCaptureAnalyzeAPI component on the same GameObject.");
        }

        BeginRecognizing();
    }

    // Update is called once per frame
    async void Update()
    {
        // Leave any pending text in place while the reply is playing; it is evaluated afterwards.
        if (responding || analyzer == null)
        {
            return;
        }

        // Take the pending text exactly once so the same result cannot trigger on every frame.
        string heard;
        lock (threadLocker)
        {
            heard = recognizedString;
            recognizedString = "";
        }

        if (string.IsNullOrEmpty(heard))
        {
            return;
        }

        if (action)
        {
            if (ContainsWord(heard, ActionWord))
            {
                Debug.Log("Analyze");
                analyzer.AirTap();
                action = false;
            }
        }
        else if (ContainsWord(heard, WakeWord))
        {
            Debug.Log("Wake");
            responding = true;
            try
            {
                await analyzer.SynthesizeAudioAsync("はい");
                action = true;
            }
            catch (System.Exception ex)
            {
                // async void: an escaping exception could not be observed by any caller, so log it here.
                Debug.LogException(ex);
            }
            finally
            {
                responding = false;
            }
        }
    }

    void OnDestroy()
    {
        SpeechRecognizer target = recognizer;
        recognizer = null;
        if (target != null)
        {
            ShutdownRecognizer(target);
        }
    }

    /// <summary>
    /// Creates the recognizer if necessary and starts continuous recognition.
    /// Failures are logged instead of thrown because this method is async void.
    /// </summary>
    public async void BeginRecognizing()
    {
        try
        {
            CreateSpeechRecognizer();

            SpeechRecognizer target = recognizer;
            if (target != null)
            {
                await target.StartContinuousRecognitionAsync().ConfigureAwait(false);
                Debug.Log("Say something...");
            }
        }
        catch (System.Exception ex)
        {
            Debug.LogException(ex);
        }
    }

    // Builds the recognizer for fromLanguage and hooks up the event handlers; does nothing if one already exists.
    void CreateSpeechRecognizer()
    {
        if (recognizer != null)
        {
            return;
        }

        SpeechConfig config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
        config.SpeechRecognitionLanguage = fromLanguage;

        recognizer = new SpeechRecognizer(config);
        recognizer.Recognizing += RecognizingHandler;
        recognizer.Recognized += RecognizedHandler;
        recognizer.SpeechStartDetected += SpeechStartDetected;
        recognizer.SpeechEndDetected += SpeechEndDetectedHandler;
        recognizer.Canceled += CancelHandler;
        recognizer.SessionStarted += SessionStartedHandler;
        recognizer.SessionStopped += SessionStoppedHandler;
    }

    // Unhooks the handlers, stops continuous recognition and disposes the given recognizer.
    private async void ShutdownRecognizer(SpeechRecognizer target)
    {
        target.Recognizing -= RecognizingHandler;
        target.Recognized -= RecognizedHandler;
        target.SpeechStartDetected -= SpeechStartDetected;
        target.SpeechEndDetected -= SpeechEndDetectedHandler;
        target.Canceled -= CancelHandler;
        target.SessionStarted -= SessionStartedHandler;
        target.SessionStopped -= SessionStoppedHandler;

        try
        {
            await target.StopContinuousRecognitionAsync().ConfigureAwait(false);
        }
        catch (System.Exception ex)
        {
            Debug.LogException(ex);
        }
        finally
        {
            target.Dispose();
        }
    }

    // Case-insensitive substring test; a null word is treated like the empty string.
    private static bool ContainsWord(string text, string word)
    {
        return text.IndexOf(word ?? "", System.StringComparison.OrdinalIgnoreCase) >= 0;
    }

    // Publishes a recognition result for Update() to pick up, then logs it outside the lock.
    private void StoreRecognizedText(string text)
    {
        lock (threadLocker)
        {
            recognizedString = text;
        }

        Debug.Log(text);
    }

    #region Speech Recognition Event Handlers
    private void SessionStartedHandler(object sender, SessionEventArgs e)
    {
    }

    private void SessionStoppedHandler(object sender, SessionEventArgs e)
    {
        // The recognizer reference is kept on purpose: OnDestroy() still has to dispose it,
        // and BeginRecognizing() can restart recognition on the same instance.
    }

    private void RecognizingHandler(object sender, SpeechRecognitionEventArgs e)
    {
        if (e.Result.Reason == ResultReason.RecognizingSpeech)
        {
            StoreRecognizedText(e.Result.Text);
        }
    }

    private void RecognizedHandler(object sender, SpeechRecognitionEventArgs e)
    {
        if (e.Result.Reason == ResultReason.RecognizedSpeech)
        {
            StoreRecognizedText(e.Result.Text);
        }
    }

    private void SpeechStartDetected(object sender, RecognitionEventArgs e)
    {
    }

    private void SpeechEndDetectedHandler(object sender, RecognitionEventArgs e)
    {
    }

    private void CancelHandler(object sender, SpeechRecognitionCanceledEventArgs e)
    {
        // Surface the cancellation (for example a wrong key or region) instead of ignoring it.
        Debug.LogWarning($"Speech recognition canceled: Reason={e.Reason}, ErrorCode={e.ErrorCode}, ErrorDetails={e.ErrorDetails}");
    }
    #endregion
}

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

MySpeechRecognizer.cs

using System.Collections;

using System.Collections.Generic;

using UnityEngine;

using Microsoft.CognitiveServices.Speech;

/// <summary>
/// Runs continuous Azure Speech SDK recognition and turns it into a two-step voice command:
/// once <see cref="WakeWord"/> is heard the component answers "はい", and the next result that
/// contains <see cref="ActionWord"/> calls <c>AirTap()</c> on the TapToCaptureAnalyzeAPI
/// component attached to the same GameObject.
/// </summary>
public class MySpeechRecognizer : MonoBehaviour
{
    // Most recent recognition text that Update() has not consumed yet.
    private string recognizedString = "";

    // Guards recognizedString: it is written by the recognizer's event handlers and read by Update().
    // NOTE(review): the SDK presumably raises those events off the Unity main thread - confirm.
    private readonly object threadLocker = new object();

    private SpeechRecognizer recognizer;
    private string fromLanguage = "ja-JP";

    // Word that arms the component (set in the Inspector).
    public string WakeWord = "";
    // Word that triggers AirTap() once the component is armed (set in the Inspector).
    public string ActionWord = "";
    // True after the wake word has been acknowledged and until the action word is handled.
    public bool action = false;

    // Cached sibling component that performs the capture/analysis and the speech synthesis.
    private TapToCaptureAnalyzeAPI analyzer;

    // True while the "はい" reply is being synthesized, so Update() does not start a second reply.
    private bool responding = false;

    // Start is called before the first frame update
    void Start()
    {
        analyzer = GetComponent<TapToCaptureAnalyzeAPI>();
        if (analyzer == null)
        {
            Debug.LogError("MySpeechRecognizer requires a TapToCaptureAnalyzeAPI component on the same GameObject.");
        }

        BeginRecognizing();
    }

    // Update is called once per frame
    async void Update()
    {
        // Leave any pending text in place while the reply is playing; it is evaluated afterwards.
        if (responding || analyzer == null)
        {
            return;
        }

        // Take the pending text exactly once so the same result cannot trigger on every frame.
        string heard;
        lock (threadLocker)
        {
            heard = recognizedString;
            recognizedString = "";
        }

        if (string.IsNullOrEmpty(heard))
        {
            return;
        }

        if (action)
        {
            if (ContainsWord(heard, ActionWord))
            {
                Debug.Log("Analyze");
                analyzer.AirTap();
                action = false;
            }
        }
        else if (ContainsWord(heard, WakeWord))
        {
            Debug.Log("Wake");
            responding = true;
            try
            {
                await analyzer.SynthesizeAudioAsync("はい");
                action = true;
            }
            catch (System.Exception ex)
            {
                // async void: an escaping exception could not be observed by any caller, so log it here.
                Debug.LogException(ex);
            }
            finally
            {
                responding = false;
            }
        }
    }

    void OnDestroy()
    {
        SpeechRecognizer target = recognizer;
        recognizer = null;
        if (target != null)
        {
            ShutdownRecognizer(target);
        }
    }

    /// <summary>
    /// Creates the recognizer if necessary and starts continuous recognition.
    /// Failures are logged instead of thrown because this method is async void.
    /// </summary>
    public async void BeginRecognizing()
    {
        try
        {
            CreateSpeechRecognizer();

            SpeechRecognizer target = recognizer;
            if (target != null)
            {
                await target.StartContinuousRecognitionAsync().ConfigureAwait(false);
                Debug.Log("Say something...");
            }
        }
        catch (System.Exception ex)
        {
            Debug.LogException(ex);
        }
    }

    // Builds the recognizer for fromLanguage and hooks up the event handlers; does nothing if one already exists.
    void CreateSpeechRecognizer()
    {
        if (recognizer != null)
        {
            return;
        }

        SpeechConfig config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
        config.SpeechRecognitionLanguage = fromLanguage;

        recognizer = new SpeechRecognizer(config);
        recognizer.Recognizing += RecognizingHandler;
        recognizer.Recognized += RecognizedHandler;
        recognizer.SpeechStartDetected += SpeechStartDetected;
        recognizer.SpeechEndDetected += SpeechEndDetectedHandler;
        recognizer.Canceled += CancelHandler;
        recognizer.SessionStarted += SessionStartedHandler;
        recognizer.SessionStopped += SessionStoppedHandler;
    }

    // Unhooks the handlers, stops continuous recognition and disposes the given recognizer.
    private async void ShutdownRecognizer(SpeechRecognizer target)
    {
        target.Recognizing -= RecognizingHandler;
        target.Recognized -= RecognizedHandler;
        target.SpeechStartDetected -= SpeechStartDetected;
        target.SpeechEndDetected -= SpeechEndDetectedHandler;
        target.Canceled -= CancelHandler;
        target.SessionStarted -= SessionStartedHandler;
        target.SessionStopped -= SessionStoppedHandler;

        try
        {
            await target.StopContinuousRecognitionAsync().ConfigureAwait(false);
        }
        catch (System.Exception ex)
        {
            Debug.LogException(ex);
        }
        finally
        {
            target.Dispose();
        }
    }

    // Case-insensitive substring test; a null word is treated like the empty string.
    private static bool ContainsWord(string text, string word)
    {
        return text.IndexOf(word ?? "", System.StringComparison.OrdinalIgnoreCase) >= 0;
    }

    // Publishes a recognition result for Update() to pick up, then logs it outside the lock.
    private void StoreRecognizedText(string text)
    {
        lock (threadLocker)
        {
            recognizedString = text;
        }

        Debug.Log(text);
    }

    #region Speech Recognition Event Handlers
    private void SessionStartedHandler(object sender, SessionEventArgs e)
    {
    }

    private void SessionStoppedHandler(object sender, SessionEventArgs e)
    {
        // The recognizer reference is kept on purpose: OnDestroy() still has to dispose it,
        // and BeginRecognizing() can restart recognition on the same instance.
    }

    private void RecognizingHandler(object sender, SpeechRecognitionEventArgs e)
    {
        if (e.Result.Reason == ResultReason.RecognizingSpeech)
        {
            StoreRecognizedText(e.Result.Text);
        }
    }

    private void RecognizedHandler(object sender, SpeechRecognitionEventArgs e)
    {
        if (e.Result.Reason == ResultReason.RecognizedSpeech)
        {
            StoreRecognizedText(e.Result.Text);
        }
    }

    private void SpeechStartDetected(object sender, RecognitionEventArgs e)
    {
    }

    private void SpeechEndDetectedHandler(object sender, RecognitionEventArgs e)
    {
    }

    private void CancelHandler(object sender, SpeechRecognitionCanceledEventArgs e)
    {
        // Surface the cancellation (for example a wrong key or region) instead of ignoring it.
        Debug.LogWarning($"Speech recognition canceled: Reason={e.Reason}, ErrorCode={e.ErrorCode}, ErrorDetails={e.ErrorDetails}");
    }
    #endregion
}

6．"YourSubscriptionKey"、"YourServiceRegion" に、Azureの音声リソースのキーと場所（リージョン）をコピペしてください。

7．fromLanguageに”ja-JP”（日本語）を指定しています。

8．TapToCaptureAnalyzeAPI.csのasync Task SynthesizeAudioAsync(string text) 関数をpublicにします。MySpeechRecognizer.csからWakeワードを認識したら、「はい」と喋らせるためです。

9．Wakeワードに「ヨンシル」、Actionワードに「何が見える」を設定しました。

実行

Unity Editor上でも動くので実行してみてください。継続的に音声認識したテキストがコンソールに表示されます。

HoloLens2で実行した動画が以下になります。

お疲れ様でした。

参考

1．音声認識と文字起こしの統合と使用
 2．音声認識を使用したコマンドの実行

MRTK.HoloLens2.Unity.Tutorials.Assets.GettingStarted.2.3.0.3.unitypackage
MRTK.HoloLens2.Unity.Tutorials.Assets.AzureSpeechServices.2.3.0.0.unitypackage

この記事を書いた人

azure-recipe-user

記事一覧

HoloLens2 × Azure Cognitive Services（Speech SDKで音声認識）

はじめに

開発環境

導入

実行

参考

この記事を書いた人

azure-recipe-user

Cognitive Services: OCR機能で名刺読み取りプログラムを作ってみた

AzureSynapseAnalyticsのマッピングデータフローで遊んでみた

【Azure AutoML】Python SDK V1 / V2で学習モデルのmetricsを取得する方法

Cognitive Services: Face APIについて