Introduction
This is the article for day 13 of the HoloLens Advent Calendar 2020.
In the previous article, saying 「何が見える?」 ("What do you see?") captured an image, generated a caption, and read it aloud. This time, when you say 「文字を読んで」 ("Read the text"), the app extracts text from a captured image and reads it aloud. 「ヨンシル、文字を読んで」 ("Yongsil, read the text")
Development environment
- Azure
- Computer Vision API (Read API)
- Speech SDK 1.14.0
- Unity 2019.4.1f1
- MRTK 2.5.1
- Windows 10 PC
- HoloLens 2
Setup
1. Finish everything through the previous article first.
2. The Unity project looks like this. Turn the previous MySpeechRecognizer's single Action word into a list and add 「文字を読んで」 to it. Add the new TapToCaptureReadAPI.cs script via Add Component; saying 「文字を読んで」 then triggers the flow: capture an image, extract its text, and read it aloud.
3. MySpeechRecognizer.cs from the previous article could only recognize a single Action word, so change it to a list so that several Action words can be recognized. Then edit the Update function as shown below so that when the Action word is 「文字を読んで」, it calls the AirTap function of TapToCaptureReadAPI.cs.
MySpeechRecognizer.cs
```csharp
// public string ActionWord = "";
public List<string> ActionWords;

async void Update()
{
    if (recognizedString != "")
    {
        // Debug.Log(recognizedString);
        if (action)
        {
            foreach (string ActionWord in ActionWords)
            {
                if (recognizedString.ToLower().Contains(ActionWord.ToLower()))
                {
                    Debug.Log("Action");
                    if (ActionWord == "何が見える")
                    {
                        Debug.Log("Analyze Image");
                        // The type parameter was stripped when the post was published;
                        // use the capture script from the previous article here (name assumed).
                        this.GetComponent<TapToCaptureAnalyzeAPI>().AirTap();
                    }
                    else if (ActionWord == "文字を読んで")
                    {
                        Debug.Log("Read");
                        this.GetComponent<TapToCaptureReadAPI>().AirTap();
                    }
                    action = false;
                }
            }
        }
        else if (recognizedString.ToLower().Contains(WakeWord.ToLower()))
        {
            Debug.Log("Wake");
            // Component providing SynthesizeAudioAsync (assumed; the capture scripts define it).
            await this.GetComponent<TapToCaptureReadAPI>().SynthesizeAudioAsync("はい"); // "はい" = "Yes"
            action = true;
        }
    }
}
```
4. The TapToCaptureReadAPI.cs script looks like this:
TapToCaptureReadAPI.cs
```csharp
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System;
using UnityEngine;
using Microsoft.MixedReality.Toolkit.Utilities;
using System.Threading.Tasks;
// SpeechSDK: start
using System.IO;
using System.Text;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
// SpeechSDK: end

public class TapToCaptureReadAPI : MonoBehaviour
{
    // Read: start
    private string read_endpoint = "https:///vision/v3.1/read/analyze"; // insert your resource endpoint
    private string read_subscription_key = "";

    // Sample "succeeded" response: see the JSON in step 9 below.
    [System.Serializable]
    public class Read
    {
        public string status;
        public string createdDateTime;
        public string lastUpdatedDateTime;
        public AnalyzeResult analyzeResult;
    }

    [System.Serializable]
    public class AnalyzeResult
    {
        public ReadResults[] readResults;
    }

    [System.Serializable]
    public class ReadResults
    {
        public Lines[] lines;
    }

    [System.Serializable]
    public class Lines
    {
        public string text;
    }
    // Read: end

    // SpeechSDK: start
    public AudioSource audioSource;

    public async Task SynthesizeAudioAsync(string text)
    {
        var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
        // If the second argument is omitted, audio plays from the PC speaker but not on HoloLens.
        var synthesizer = new SpeechSynthesizer(config, null);
        // NOTE: the SSML tags were stripped when this post was published; the markup below is a
        // reconstruction, and the voice name is only an example (any ja-JP voice works).
        string ssml = "<speak version=\"1.0\" xmlns=\"https://www.w3.org/2001/10/synthesis\" xml:lang=\"ja-JP\">"
                    + "<voice name=\"ja-JP-NanamiNeural\">" + text + "</voice></speak>";

        // Starts speech synthesis, and returns after a single utterance is synthesized.
        // using (var result = synthesizer.SpeakTextAsync(text).Result)
        using (var result = synthesizer.SpeakSsmlAsync(ssml).Result)
        {
            // Checks result.
            if (result.Reason == ResultReason.SynthesizingAudioCompleted)
            {
                // Native playback is not supported on Unity yet (currently only supported on Windows/Linux Desktop).
                // Use the Unity API to play audio here as a short term solution.
                // Native playback support will be added in the future release.
                var sampleCount = result.AudioData.Length / 2;
                var audioData = new float[sampleCount];
                for (var i = 0; i < sampleCount; ++i)
                {
                    audioData[i] = (short)(result.AudioData[i * 2 + 1] << 8 | result.AudioData[i * 2]) / 32768.0F;
                }

                // The output audio format is 16K 16bit mono
                var audioClip = AudioClip.Create("SynthesizedAudio", sampleCount, 1, 16000, false);
                audioClip.SetData(audioData, 0);
                audioSource.clip = audioClip;
                audioSource.Play();
                // newMessage = "Speech synthesis succeeded!";
            }
            else if (result.Reason == ResultReason.Canceled)
            {
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(result);
                // newMessage = $"CANCELED:\nReason=[{cancellation.Reason}]\nErrorDetails=[{cancellation.ErrorDetails}]\nDid you update the subscription info?";
            }
        }
    }
    // SpeechSDK: end

    public GameObject quad;
    UnityEngine.Windows.WebCam.PhotoCapture photoCaptureObject = null;
    Texture2D targetTexture = null;
    private bool waitingForCapture;

    void Start()
    {
        waitingForCapture = false;
    }

    public void AirTap()
    {
        if (waitingForCapture) return;
        waitingForCapture = true;

        Resolution cameraResolution = UnityEngine.Windows.WebCam.PhotoCapture.SupportedResolutions.OrderByDescending((res) => res.width * res.height).First();
        targetTexture = new Texture2D(cameraResolution.width, cameraResolution.height);

        // Create a PhotoCapture object
        UnityEngine.Windows.WebCam.PhotoCapture.CreateAsync(false, delegate (UnityEngine.Windows.WebCam.PhotoCapture captureObject)
        {
            photoCaptureObject = captureObject;
            UnityEngine.Windows.WebCam.CameraParameters cameraParameters = new UnityEngine.Windows.WebCam.CameraParameters();
            cameraParameters.hologramOpacity = 0.0f;
            cameraParameters.cameraResolutionWidth = cameraResolution.width;
            cameraParameters.cameraResolutionHeight = cameraResolution.height;
            cameraParameters.pixelFormat = UnityEngine.Windows.WebCam.CapturePixelFormat.BGRA32;

            // Activate the camera
            photoCaptureObject.StartPhotoModeAsync(cameraParameters, delegate (UnityEngine.Windows.WebCam.PhotoCapture.PhotoCaptureResult result)
            {
                // Take a photo
                photoCaptureObject.TakePhotoAsync(OnCapturedPhotoToMemoryAsync);
            });
        });
    }

    async void OnCapturedPhotoToMemoryAsync(UnityEngine.Windows.WebCam.PhotoCapture.PhotoCaptureResult result, UnityEngine.Windows.WebCam.PhotoCaptureFrame photoCaptureFrame)
    {
        // Copy the raw image data into the target texture
        photoCaptureFrame.UploadImageDataToTexture(targetTexture);
        byte[] bodyData = targetTexture.EncodeToJPG();

        Response response = new Response();
        Dictionary<string, string> headers = new Dictionary<string, string>();
        headers.Add("Ocp-Apim-Subscription-Key", read_subscription_key);
        try
        {
            string query = read_endpoint;
            // headers.Add("Content-Type", "application/octet-stream");
            response = await Rest.PostAsync(query, bodyData, headers, -1, true);
        }
        catch (Exception e)
        {
            photoCaptureObject.StopPhotoModeAsync(OnStoppedPhotoMode);
            return;
        }

        if (!response.Successful)
        {
            photoCaptureObject.StopPhotoModeAsync(OnStoppedPhotoMode);
            return;
        }
        Debug.Log(response.ResponseCode);
        Debug.Log(response.ResponseBody);
        // With the Rest.cs change in step 6, ResponseBody holds the Operation-Location URL.
        string operation_url = response.ResponseBody;

        bool poll = true;
        int i = 0;
        Read read;
        do
        {
            System.Threading.Thread.Sleep(1000);
            // https://docs.microsoft.com/ja-jp/azure/cognitive-services/computer-vision/concept-recognizing-text
            // https://github.com/Azure-Samples/cognitive-services-quickstart-code/blob/master/dotnet/ComputerVision/REST/CSharp-hand-text.md
            // While still running: {"status":"running","createdDateTime":"2020-12-12T19:39:44Z","lastUpdatedDateTime":"2020-12-12T19:39:44Z"}
            response = await Rest.GetAsync(operation_url, headers, -1, null, true);
            Debug.Log(response.ResponseCode);
            Debug.Log(response.ResponseBody);
            read = JsonUtility.FromJson<Read>(response.ResponseBody);
            if (read.status == "succeeded")
            {
                poll = false;
            }
            ++i;
        } while (i < 60 && poll);

        if (i == 60 && poll == true)
        {
            photoCaptureObject.StopPhotoModeAsync(OnStoppedPhotoMode);
            return;
        }

        string read_text = "";
        foreach (ReadResults readResult in read.analyzeResult.readResults)
        {
            foreach (Lines line in readResult.lines)
            {
                read_text = read_text + line.text + "\n";
                // Debug.Log(line.text);
            }
        }
        read_text += "以上です。"; // "That's all."

        // SpeechSDK addition: start
        Debug.Log(read_text);
        await SynthesizeAudioAsync(read_text); // jp
        // SpeechSDK addition: end

        // Deactivate the camera
        photoCaptureObject.StopPhotoModeAsync(OnStoppedPhotoMode);
    }

    void OnStoppedPhotoMode(UnityEngine.Windows.WebCam.PhotoCapture.PhotoCaptureResult result)
    {
        // Shut down the photo capture resources
        photoCaptureObject.Dispose();
        photoCaptureObject = null;
        waitingForCapture = false;
    }
}
```
5. For the Computer Vision API (Read API) endpoint and key, use the same ones as for image caption generation.
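For reference, the endpoint is your Computer Vision resource URL followed by the `/vision/v3.1/read/analyze` path. A hypothetical example (the resource name and key below are placeholders, not values from this project):
```csharp
// Placeholder values -- substitute your own Computer Vision resource endpoint and key.
private string read_endpoint = "https://<your-resource-name>.cognitiveservices.azure.com/vision/v3.1/read/analyze";
private string read_subscription_key = "<your-subscription-key>";
```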
6. After the AirTap function of TapToCaptureReadAPI.cs captures an image, it POSTs the image to the Read API. The response headers then contain {"Operation-Location": URL}, and you need to GET that URL. However, MRTK's Rest.cs does not return the response headers, so edit the ProcessRequestAsync function from line 275 as follows:
Rest.cs
```csharp
// ...
if (readResponseData)
{
    // For Read API
    Dictionary<string, string> tmp = webRequest.GetResponseHeaders();
    if (tmp == null)
    {
        return new Response(true, webRequest.downloadHandler?.text, webRequest.downloadHandler?.data, webRequest.responseCode);
    }
    else
    {
        if (tmp.ContainsKey("Operation-Location"))
        {
            string responseHeaders = tmp["Operation-Location"];
            string downloadHandlerText = webRequest.downloadHandler?.text;
            return new Response(true, responseHeaders, webRequest.downloadHandler?.data, webRequest.responseCode);
        }
        else
        {
            return new Response(true, webRequest.downloadHandler?.text, webRequest.downloadHandler?.data, webRequest.responseCode);
        }
    }
}
else // This option can be used only if action will be triggered in the same scope as the webrequest
{
    return new Response(true, () => webRequest.downloadHandler?.text, () => webRequest.downloadHandler?.data, webRequest.responseCode);
}
// ...
```
7. With this change, the ResponseBody returns the URL from the response headers, so GET that URL.
8. While text extraction is still running, JSON like the following is returned, so keep GETting once per second until "status" becomes "succeeded".
{"status":"running","createdDateTime":"2020-12-12T19:39:44Z","lastUpdatedDateTime":"2020-12-12T19:39:44Z"} |
9. Once "status" is "succeeded", the text extraction result comes back as JSON like the following, so I created the Read, AnalyzeResult, ReadResults, and Lines classes to match this schema.
{"status":"succeeded","createdDateTime":"2020-12-13T03:05:57Z","lastUpdatedDateTime":"2020-12-13T03:05:58Z","analyzeResult":{"version":"3.0.0","readResults":[{"page":1,"angle":-1.4203,"width":1280,"height":720,"unit":"pixel","lines":[{"boundingBox":[417,351,481,348,482,365,417,367],"text":"BANANA","words":[{"boundingBox":[420,352,481,348,482,365,421,367],"text":"BANANA","confidence":0.595}]},{"boundingBox":[541,348,604,347,605,360,542,362],"text":"ALMOND","words":[{"boundingBox":[542,349,605,347,605,360,543,363],"text":"ALMOND","confidence":0.916}]},{"boundingBox":[644,345,724,344,724,358,645,359],"text":"STRAWBERRY","words":[{"boundingBox":[647,346,724,344,724,358,647,359],"text":"STRAWBERRY","confidence":0.638}]},{"boundingBox":[757,334,843,337,843,356,757,353],"text":"PINEAPPLE","words":[{"boundingBox":[763,335,842,338,843,355,763,353],"text":"PINEAPPLE","confidence":0.623}]},{"boundingBox":[385,382,748,374,750,465,387,475],"text":"LOOK","words":[{"boundingBox":[385,384,746,374,746,467,387,475],"text":"LOOK","confidence":0.983}]},{"boundingBox":[479,474,668,468,670,501,480,508],"text":"A La Mode","words":[{"boundingBox":[482,475,512,474,513,508,483,509],"text":"A","confidence":0.987},{"boundingBox":[519,474,567,472,567,506,520,508],"text":"La","confidence":0.987},{"boundingBox":[573,472,669,468,670,501,574,505],"text":"Mode","confidence":0.984}]},{"boundingBox":[489,508,666,501,666,518,490,526],"text":"CHOCOLATE","words":[{"boundingBox":[489,509,666,501,666,519,490,527],"text":"CHOCOLATE","confidence":0.981}]}]}]}} |
10. Finally, the extracted text is passed to speech synthesis and read aloud.
Run
Check out the video of it running. Now it can read text like this! It handles fairly small text, too. It can even read business cards, so it could be quite handy. Japanese isn't supported yet, so all we can do is wait. (Currently supported languages: Dutch, English, French, German, Italian, Portuguese, and Spanish)
Incidentally, the wake word 「ヨンシル」 is easily misrecognized as 「4種類」 or 「キャンセル」, so it's better to change it. 「ガチモト」 comes out as 「が地元」 or 「合致もっと」, so 「藤本」 works better.