Implementing an app that detects people without a mask using Vision (Apple's official framework)

MLBoy
Jul 20, 2020 · 5 min read

You can easily create an app that analyzes the camera feed of your iPad or iPhone and detects people who are not wearing a mask.
Place it at the entrance of your shop.

Finding a human face

To determine whether someone in the camera frame is not wearing a mask, you just need to know whether there is a face with a visible nose and mouth.

Vision

For that, you can use Apple's Vision framework.
Vision includes the ability to detect human faces.
If a person is wearing a mask, Vision cannot detect their face as a face.
Therefore, if Vision says the camera image contains a face, that face is unmasked.

Vision's face detection is provided by VNDetectFaceRectanglesRequest.
If you pass a camera frame to this request, it returns whether there is a human face in the image and where it is.

Set it up as follows.

@discardableResult
func setupVision() -> NSError? {
    // Setup Vision parts
    let error: NSError! = nil

    let detectFaceRequest: VNDetectFaceRectanglesRequest = {
        let request = VNDetectFaceRectanglesRequest(completionHandler: { (request, error) in
            DispatchQueue.main.async(execute: {
                // Perform all the UI updates on the main queue
                if let results = request.results {
                    self.processVisionRequestResults(results)
                }
            })
        })
        request.revision = VNDetectFaceRectanglesRequestRevision2
        return request
    }()

    self.requests = [detectFaceRequest]

    return error
}

Pass the camera image to Vision for analysis

Next, we pass the image from each camera frame to this request, frame by frame.
To access your iPad's or iPhone's camera, use AVFoundation.

import UIKit
import Foundation
import Vision
import AVFoundation

extension ViewController: AVCaptureVideoDataOutputSampleBufferDelegate {
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        // isRequest throttles analysis; it is assumed to be set back to true elsewhere (e.g. on a timer).
        if isRequest {
            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
                return
            }
            currentBuffer = pixelBuffer

            let exifOrientation = exifOrientationFromDeviceOrientation()
            let imageRequestHandler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: exifOrientation, options: [:])
            do {
                try imageRequestHandler.perform(self.requests)
            } catch {
                print(error)
            }
            isRequest = false
        }
    }

    func setupAVCapture() {
        var deviceInput: AVCaptureDeviceInput!

        // Select a video device, make an input
        let videoDevice = AVCaptureDevice.DiscoverySession(deviceTypes: [.builtInWideAngleCamera], mediaType: .video, position: .front).devices.first
        do {
            deviceInput = try AVCaptureDeviceInput(device: videoDevice!)
        } catch {
            print("Could not create video device input: \(error)")
            return
        }

        session.beginConfiguration()
        session.sessionPreset = .vga640x480 // A small preset is enough for face detection.

        // Add a video input
        guard session.canAddInput(deviceInput) else {
            print("Could not add video device input to the session")
            session.commitConfiguration()
            return
        }
        session.addInput(deviceInput)

        // Add a video data output
        if session.canAddOutput(videoDataOutput) {
            session.addOutput(videoDataOutput)
            videoDataOutput.alwaysDiscardsLateVideoFrames = true
            videoDataOutput.videoSettings = [kCVPixelBufferPixelFormatTypeKey as String: Int(kCVPixelFormatType_420YpCbCr8BiPlanarFullRange)]
            videoDataOutput.setSampleBufferDelegate(self, queue: videoDataOutputQueue)
        } else {
            print("Could not add video data output to the session")
            session.commitConfiguration()
            return
        }

        let captureConnection = videoDataOutput.connection(with: .video)
        captureConnection?.videoOrientation = .portrait
        // Always process the frames
        captureConnection?.isEnabled = true
        do {
            try videoDevice!.lockForConfiguration()
            let dimensions = CMVideoFormatDescriptionGetDimensions((videoDevice?.activeFormat.formatDescription)!)
            bufferSize.width = CGFloat(dimensions.height)
            bufferSize.height = CGFloat(dimensions.width)
            videoDevice!.unlockForConfiguration()
        } catch {
            print(error)
        }
        session.commitConfiguration()

        setupVision()

        // Start the capture
        startCaptureSession()
    }

    func startCaptureSession() {
        session.startRunning()
    }

    public func exifOrientationFromDeviceOrientation() -> CGImagePropertyOrientation {
        let curDeviceOrientation = UIDevice.current.orientation
        let exifOrientation: CGImagePropertyOrientation

        switch curDeviceOrientation {
        case UIDeviceOrientation.portraitUpsideDown: // Device oriented vertically, home button on the top
            exifOrientation = .left
        case UIDeviceOrientation.landscapeLeft: // Device oriented horizontally, home button on the right
            exifOrientation = .upMirrored
        case UIDeviceOrientation.landscapeRight: // Device oriented horizontally, home button on the left
            exifOrientation = .down
        case UIDeviceOrientation.portrait: // Device oriented vertically, home button on the bottom
            exifOrientation = .up
        default:
            exifOrientation = .up
        }
        return exifOrientation
    }

    @discardableResult
    func setupVision() -> NSError? {
        // Setup Vision parts
        let error: NSError! = nil

        let detectFaceRequest: VNDetectFaceRectanglesRequest = {
            let request = VNDetectFaceRectanglesRequest(completionHandler: { (request, error) in
                DispatchQueue.main.async(execute: {
                    // Perform all the UI updates on the main queue
                    if let results = request.results {
                        self.processVisionRequestResults(results)
                    }
                })
            })
            request.revision = VNDetectFaceRectanglesRequestRevision2
            return request
        }()

        self.requests = [detectFaceRequest]

        return error
    }

    func processVisionRequestResults(_ results: [Any]) {
        if currentBuffer != nil {
            currentBuffer = nil
            guard let observation = results.first as? VNFaceObservation else {
                // No face found: the person is presumably wearing a mask (or no one is in frame).
                masking()
                return
            }
            // Vision found a face, so the person is not wearing a mask.
            noMasking()
        }
    }
}
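
The extension above refers to several properties (session, videoDataOutput, videoDataOutputQueue, requests, isRequest, currentBuffer, and bufferSize) that live on ViewController itself. The article doesn't show their declarations, so here is a minimal sketch of what they might look like. The names come from the code above, but the initial values and the viewDidLoad wiring are assumptions, not necessarily what the GitHub project does.

import UIKit
import Vision
import AVFoundation

class ViewController: UIViewController {
    // Capture pipeline used by setupAVCapture()
    let session = AVCaptureSession()
    let videoDataOutput = AVCaptureVideoDataOutput()
    let videoDataOutputQueue = DispatchQueue(label: "VideoDataOutput", qos: .userInitiated)

    // Vision requests built in setupVision()
    var requests = [VNRequest]()

    // State shared with the capture delegate
    var isRequest = true              // assumed to be set back to true elsewhere (e.g. on a timer) to throttle analysis
    var currentBuffer: CVPixelBuffer? // the frame currently being analyzed
    var bufferSize: CGSize = .zero    // filled in by setupAVCapture()

    // talker / talking for the spoken warning are introduced later in the article.

    override func viewDidLoad() {
        super.viewDidLoad()
        setupAVCapture() // also calls setupVision() and starts the session
    }
}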

The AVCaptureVideoDataOutputSampleBufferDelegate method receives each camera frame and passes its image to Vision's face detection request.

The request is performed in the following part.

func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
    if isRequest {
        guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
            return
        }
        currentBuffer = pixelBuffer

        let exifOrientation = exifOrientationFromDeviceOrientation()
        let imageRequestHandler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: exifOrientation, options: [:])
        do {
            try imageRequestHandler.perform(self.requests)
        } catch {
            print(error)
        }
        isRequest = false
    }
}

Process the result (whether there is an unmasked person)

The request has a completion handler that is called when it finishes.
Process the results there.

let request = VNDetectFaceRectanglesRequest(completionHandler: { (request, error) in
    DispatchQueue.main.async(execute: {
        // Perform all the UI updates on the main queue
        if let results = request.results {
            self.processVisionRequestResults(results)
        }
    })
})
request.revision = VNDetectFaceRectanglesRequestRevision2
return request
}()

If the image contains a face, the request's results will include an observation saying that a face was found. This is a VNFaceObservation.
If the results contain one, call the no-mask function, because there is an unmasked face.

func processVisionRequestResults(_ results: [Any]) {
    if currentBuffer != nil {
        currentBuffer = nil
        guard let observation = results.first as? VNFaceObservation else {
            masking()
            return
        }
        noMasking()
    }
}

If someone isn't wearing a mask…

With AVFoundation's AVSpeechSynthesizer, the app speaks a warning: "Please wear a mask."

var talker = AVSpeechSynthesizer()
var talking = false

func noMasking() {
    if !talking {
        let utterance = AVSpeechUtterance(string: NSLocalizedString("Please wear a mask.", comment: ""))
        utterance.voice = AVSpeechSynthesisVoice(language: NSLocalizedString("en-US", comment: ""))
        talker.speak(utterance)
        talking = true
    }
}
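
Note that the snippet above never resets talking, and the masking() function called earlier isn't shown. One way to complete the picture, purely as an assumption rather than what the GitHub project does: make the view controller the synthesizer's delegate (talker.delegate = self, for example in viewDidLoad) and clear the flag when the utterance finishes.

extension ViewController: AVSpeechSynthesizerDelegate {
    // Clear the flag when the warning has finished playing,
    // so the next unmasked face triggers a new announcement.
    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        talking = false
    }

    // Called when no face was detected (the person is presumably masked or out of frame).
    // Left empty here; you might hide a warning overlay or update a status label instead.
    func masking() {
    }
}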

By the way, VNFaceObservation also gives us the coordinates of the rectangle surrounding the face.
If you overlay a "no mask" warning rectangle on the camera image, the app feels even more like a monitoring system; a sketch of how that could look follows.
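
For example, here is a minimal sketch of such an overlay. It assumes a hypothetical detectionOverlay CALayer whose coordinate system matches bufferSize (the arrangement Apple's live-capture samples use); that layer and its setup are not part of the code above.

// Draw a red warning box around an unmasked face.
// detectionOverlay is a hypothetical CALayer sized to bufferSize and placed over the camera preview.
func drawNoMaskBox(for observation: VNFaceObservation) {
    CATransaction.begin()
    CATransaction.setValue(kCFBooleanTrue, forKey: kCATransactionDisableActions)

    // Remove boxes from the previous frame.
    detectionOverlay.sublayers?.forEach { $0.removeFromSuperlayer() }

    // Convert the normalized bounding box to pixel coordinates in the camera buffer.
    let faceRect = VNImageRectForNormalizedRect(observation.boundingBox,
                                                Int(bufferSize.width),
                                                Int(bufferSize.height))

    let boxLayer = CALayer()
    boxLayer.frame = faceRect
    boxLayer.borderColor = UIColor.red.cgColor
    boxLayer.borderWidth = 4
    detectionOverlay.addSublayer(boxLayer)

    CATransaction.commit()
}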

Here is the actual app

The app is on GitHub:
https://github.com/john-rocky/MaskPlease

Please follow me on Twitter:
https://twitter.com/JackdeS11

Please send work requests to this email:
rockyshikoku@gmail.com

Also, give this article a clap!

Ciao 🐣!
