#import "template.typ": *
#import "@preview/glossarium:0.2.6": make-glossary, print-glossary, gls, glspl
#show: make-glossary
#import "@preview/fletcher:0.4.1" as fletcher: node, edge, utils
#import "@preview/cetz:0.2.0"
#import "@preview/lovelace:0.2.0": *
#show: setup-lovelace
#set text(region: "GB")
// constants
// colours from https://mk.bcgsc.ca/colorblind/palettes.mhtml
#let colour_network = rgb("2271b2")
#let colour_network_transparent = rgb(34, 113, 178, 50%)
#let colour_audio = rgb("e69f00")
#let colour_database = rgb("f748a5")
#let colour_analysis = rgb("359b73")
#let colour_highlight = rgb("d55e00")
// replacements
#show "varys-network": name => box[#text(colour_network)[#raw(name.text.at(0))#raw(name.text.slice(1))]]
#show "varys-audio": name => box[#text(colour_audio)[#raw(name.text.at(0))#raw(name.text.slice(1))]]
#show "varys-database": name => box[#text(colour_database)[#raw(name.text.at(0))#raw(name.text.slice(1))]]
#show "varys-analysis": name => box[#text(colour_analysis)[#raw(name.text.at(0))#raw(name.text.slice(1))]] // the at(0)/slice(1) trick prevents an infinite recursion
// diagrams
#let diagram(label, caption, diagram) = {
set text(size: 10pt)
[#figure(
diagram,
caption: caption,
kind: "diagram",
supplement: [Diagram],
) #label]
}
#let edge_(..args) = edge(..args, marks: (none, "|>")) // we can't use set rules for user defined functions yet, so we overwrite it
#let group_node(colour, ..args) = node(
..args,
stroke: colour,
fill: colour.lighten(80%),
)
#let group_edge(colour, description, ..args) = edge(
..args,
text(colour)[#description],
"..",
stroke: colour,
label-side: left,
)
#let enclose_nodes(nodes, colour, clearance: (8pt, 8pt, 8pt, 8pt)) = {
let (center, size) = fletcher.bounding-rect(nodes.map(node => node.real-pos))
center.at(0) = center.at(0) - (clearance.at(3) - clearance.at(1))
center.at(1) = center.at(1) - (clearance.at(2) - clearance.at(0))
cetz.draw.content(
center,
rect(
width: size.at(0) + clearance.at(1) + clearance.at(3),
height: size.at(1) + clearance.at(2) + clearance.at(0),
radius: 16pt,
stroke: colour,
fill: colour.lighten(85%),
)
)
}
// ml charts
#let test = csv("csv/ml/test.csv").map(
item => (int(item.at(1)), int(item.at(2)), float(item.at(3)))
)
#let ml-chart(id, label-count, title, size: (6, 5)) = {
cetz.canvas({
import cetz.draw: *
import cetz.plot
let data = csv("csv/ml/" + str(id) + "-train.csv").map(item => (int(item.at(0)), float(item.at(1)), float(item.at(2))))
let accuracy = data.map(item => (item.at(0), item.at(1)))
let loss = data.map(item => (item.at(0), item.at(2)))
let y-min = calc.round(100/label-count)
plot.plot(
size: size,
axis-style: "scientific",
x-label: "epoch",
x-tick-step: data.last().at(0) / 4,
y-label: "Accuracy",
y-min: y-min,
y-max: 100,
y-tick-step: 10,
y-format: tick => str(tick) + "%",
y2-label: "Loss",
y2-min: 0,
y2-max: 3,
y2-tick-step: 0.5,
{
plot.add(
loss,
axes: ("x", "y2"),
style: (stroke: 0.5pt + colour_highlight),
)
plot.add(
accuracy,
style: (stroke: 0.5pt + colour_analysis),
)
plot.add-hline(
test.at(id).at(2) * 100,
style: (stroke: 1.5pt + black)
)
plot.annotate({
content((data.last().at(0) / 2, 100 - 7 * (100 - calc.round(100/label-count)) / 100), title)
})
})
})
}
// functions
#let titlecase(string) = { // taken from https://github.com/typst/typst/issues/1707#issuecomment-1635411094
string.replace(
regex("[A-Za-z]+('[A-Za-z]+)?"),
word => upper(word.text.first()) + lower(word.text.slice(1)),
)
}
#let pn(number, total) = [$#{calc.round(100*number/total, digits: 2)}%$ ($#number$)]
#let pcite = cite.with(form: "prose")
#let sql(label, caption, query) = {
set raw(lang: "sql", theme: "assets/light.tmtheme")
[#figure(
block(inset: 1em, radius: 3pt, width: 103%, stroke: rgb("aaa"), align(left)[#query]),
caption: caption,
kind: "sql",
supplement: [SQL listing],
gap: 0.75em,
) #label]
}
#show: project.with(
title: [A Testbed for Voice Assistant \ Traffic Fingerprinting],
authors: (
(name: "Milan van Zanten", email: "milan.vanzanten@unibas.ch"),
),
examiner: (
name: "Prof. Dr. Isabel Wagner",
role: "Examiner",
),
group: (
faculty: "Faculty of Science",
department: "Department of Mathematics and Computer Science",
name: "Privacy-Enhancing Technologies Group",
url: "https://pet.dmi.unibas.ch",
),
acknowledgement: par(justify: true, block(width: 80%)[
Firstly, I would like to thank Prof. Dr. Isabel Wagner for the opportunity to write my master's thesis in her research group. Her insightful perspectives during our progress meetings have been invaluable. Furthermore, I am especially grateful for the help and explanations provided by Diego, Shiva and Patrick, who have generously shared their knowledge with a machine learning novice like me. I also wish to express my heartfelt appreciation to Patricia and Heike for assisting me with navigating the bureaucratic process of delaying my thesis during a difficult time. Last, but certainly not least, I thank Michelle, my family, and my friends for supporting me throughout the work on this thesis. I would not have been able to complete it without them.
]),
abstract: par(justify: true, block(width: 80%)[
In this thesis, we investigate the viability of traffic fingerprinting on voice assistants like Apple's Siri. Initially, we built a testbed that can house different kinds of smart speakers and autonomously interact with the voice assistants running on them. It collects network captures, audio recordings, timestamps and a transcript of the voice assistant's response. With this system, we conducted an experiment over one month to collect data on approximately 70'000 interactions with an Apple HomePod. As a proof of concept, the collected traffic traces were used to train a machine learning model that can infer what was said to the voice assistant from only the network traffic. Our findings indicate that our system can correctly classify unseen network traffic with an accuracy of up to 86%, revealing a vulnerability in Siri's security and privacy measures.
]),
date: "February 29, 2024",
logo: "images/logo-en.svg",
)
= Introduction <introduction>
Smart speakers have become ubiquitous in many countries. Global sales of Amazon Alexa devices surpassed half a billion in 2023 @Amazon_2023 and Apple sells over $10$ million HomePods every year @Statista_Apple_2023. A study shows that $35%$ of adults in the U.S. own a smart speaker @NPR_2022. With half of the participants in that study reporting that they have heard an advertisement on their smart speaker before, concerns about the privacy and security of these ever-listening devices are well founded, especially since they are usually placed where private or confidential conversations take place.
To back claims about loss of privacy or decreased security, a thorough analysis of smart speakers is necessary. In this thesis, we build a system that allows us to collect a large amount of data on interactions with voice assistants.
== Previous Work
There are several previous works analysing the privacy and security risks of voice assistants. Most of them focus on Amazon Alexa, since its adoption rate is significantly higher than that of Apple's Siri or Google Assistant @Bolton_2021. Some examples include experiments on smart speaker misactivations @Ford2019 @Dubois_2020 @schonherr2022101328, analyses of voice assistant #gls("skill")s @Natatsuka_2019 @Shezan_2020 @255322 @Edu_2022 @Xie_2022 and voice command fingerprinting @8802686 @Wang_2020aa. Apart from some older work on traffic patterns of Siri @caviglione_2013 and research by Apple @Apple_2023 @Apple_2018, we found little information about that voice assistant.
== Thesis Overview <overview>
The primary bottleneck when collecting large amounts of data on voice assistants is the time spent speaking and waiting for the response. According to our tests, the majority of interactions take between $4$ and $8$ seconds plus any additional delay between the question and the response. This aligns with the results by #pcite(<Haas_2022>), who found a median of $4.03$s per typical sentence. With the goal of collecting a large number of samples for a list of voice commands (henceforth called "queries"), the system will therefore need to run for a substantial amount of time.
With this in mind, the primary aim of this thesis is to build a testbed that can autonomously interact with a smart speaker. This system should take a list of queries and continuously go through it while collecting audio recordings, network packet traces, timestamps and other relevant metadata about each interaction. In @background, we establish why this data is of interest; the implementation of our system is explored in @system-design.
For the second part, documented in Sections @experiment[] and @results[], we run an experiment using our system, collecting interactions with an Apple HomePod mini#footnote[#link("https://apple.com/homepod-mini")] smart speaker. With this dataset of interactions, we train an #gls("ml") model for traffic fingerprinting. The model is adapted from a similar implementation by #pcite(<Wang_2020aa>) but is applied to data collected from the Siri voice assistant instead of Alexa or Google Assistant.
#pagebreak()
= Background <background>
Having outlined our plan for an unsupervised system that collects interactions with smart speakers, we now explain the reasoning behind collecting that data. In the taxonomy in @diag-traffic-collection-taxonomy, adapted from a more detailed version in the survey by #pcite(<Papadogiannaki_2021>), we can see that security and privacy are two of the main focus points for research on the analysis of encrypted network traffic.
#diagram(<diag-traffic-collection-taxonomy>)[A taxonomy of research on encrypted network traffic analysis @Papadogiannaki_2021.][
#fletcher.diagram(
node-stroke: 1pt,
node-fill: rgb("eee"),
edge-stroke: 1pt,
node-corner-radius: 4pt,
edge-corner-radius: 4pt,
spacing: 1.5em,
node((0, 0), align(center)[Encrypted Traffic \ Analysis and Inspection]),
edge_("rr"),
edge_("r", "d", "r"),
edge_("r", "dd", "r"),
edge_("r", "ddd", "r"),
node((2, 0), [Analytics]),
group_node(colour_highlight, (2, 1), [Security]),
group_node(colour_highlight, (2, 2), [User Privacy]),
node((2, 3), [Middleboxes#footnote[Devices used for network function, e.g. NATs or firewalls.]]),
node((4, 0), align(center)[Manipulation of Metadata \ and Characteristics]),
group_node(colour_highlight, (4, 1), align(center)[Interception of \ Encrypted Traffic]),
node((4, 2), align(center)[Utilisation of \ Cryptographic Functions]),
node((2, -0.9), [Use Cases], fill: none, stroke: none),
group_edge(black, "", (1.1, -0.2), (1.1, -0.6), (2.85, -0.6), (2.85, 3.7), (1.1, 3.7), (1.1, 3.4)),
edge_((2.85, 0), (4, 0)),
edge_((2.85, 0), (3, 0), "d", "r"),
edge_((2.85, 0), (3, 0), "dd", "r"),
node((4, -0.9), [Techniques], fill: none, stroke: none),
group_edge(black, "", (3.1, -0.2), (3.1, -0.6), (4.9, -0.6), (4.9, 2.6), (3.1, 2.6), (3.1, 2.25)),
)
]
#cite(<Papadogiannaki_2021>, form: "author") write in their survey #quote(attribution: <Papadogiannaki_2021>, [In the second category #emph[interception of encrypted traffic,] we encounter solutions that take advantage of behavioural patterns or signatures to report a specific event.]) Our main focus lies on the encrypted network traffic being collected. This traffic cannot be decrypted; however, we can analyse it using a technique called traffic fingerprinting.
== Traffic Fingerprinting
Traffic fingerprinting uses pattern recognition approaches on collected, encrypted traffic traces to infer user behaviour. This method is used successfully to analyse web traffic @Wang_2020, traffic on the Tor network @Oh_2017, #gls("iot") device traffic @8802686 @Wang_2020aa @Mazhar_2020aa @Zuo_2019 @Trimananda_2019aa and user location @Ateniese_2015aa. Traffic fingerprinting on smart speakers is generally used to infer the command given to the voice assistant by the user. To enable us to run traffic fingerprinting, our system stores the query and the network traffic for each interaction it has with the smart speaker.
We adopt the concept of #emph[open world] and #emph[closed world] models from previous traffic fingerprinting works @Wang_2020aa @Wang_2020. Both models define an attacker who has a set of queries they are interested in. In the open world scenario, the attacker wants to know whether or not a given command is in that set; the traffic analysed in this case can come from known queries or from queries not seen before. In the closed world scenario, the attacker wants to differentiate between the queries in that set, but cannot tell whether a new query belongs to it or not. As an example, the open world model could be used to infer whether the smart speaker is being used to communicate with a person that an attacker is monitoring. In the closed world case, the attacker might want to differentiate between the different people a user has contacted. In this thesis, we focus on the closed world model.
== Threat Model <threat-model>
Our threat model is similar to the ones introduced by #pcite(<Wang_2020aa>) and @8802686, specifically,
- the attacker has access to the network the smart speaker is connected to;
- the MAC address of the smart speaker is known;
- the type of the voice assistant is known (e.g. Siri, Alexa or Google Assistant).
Previous work @Mazhar_2020aa @Trimananda_2019aa shows that it is feasible to extract this information from the encrypted network traffic. DNS queries, which are typically unencrypted, can also be used to identify smart speakers on a network: our own analysis shows that the Apple HomePod regularly sends DNS queries for the domain `gsp-ssl.ls-apple.com.akadns.net`, and #pcite(<Wang_2020aa>) found that the Amazon Echo smart speaker sends DNS queries for `unagi-na.amazon.com`. The specific domains queried may differ regionally (as indicated by the `-na` part in the Amazon subdomain), but should be simple to obtain.
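As an illustration of this identification step, the following Python sketch (using the scapy library, not part of #gls("varys")) scans a capture file for the HomePod-specific domain and reports the MAC address that queried it; the capture file name is a placeholder.
```python
# Illustrative sketch: identify a HomePod on the local network from its DNS queries.
# The capture file name is a placeholder; this is not part of the varys implementation.
from scapy.all import rdpcap
from scapy.layers.dns import DNSQR
from scapy.layers.l2 import Ether

for packet in rdpcap("capture.pcap"):
    if packet.haslayer(DNSQR) and packet.haslayer(Ether):
        queried = packet[DNSQR].qname.decode().rstrip(".")
        if queried.endswith("ls-apple.com.akadns.net"):
            # the Ethernet source of this query is likely the HomePod
            print(f"possible HomePod at {packet[Ether].src} (queried {queried})")
```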
Finally, we assume the traffic is encrypted, since the attack would be trivial otherwise.
== Voice Assistant Choice
The most popular voice assistants are Apple's Siri, Amazon Alexa and Google Assistant @Bolton_2021. Support for Microsoft Cortana was dropped in August 2023 @Microsoft_Cortana_2023, likely in favour of Copilot for Windows, which was announced one month later @Microsoft_Copilot_2023 (for more on this see @vas-based-on-llms).
Our system supports the Siri and Alexa voice assistants but can easily be extended with support for more. For the experiment in @experiment, we decided to use Siri as it is still less researched @Bolton_2021 than Alexa, for which a host of security and privacy issues have already been found @Wang_2020aa @8802686 @Ford2019 @Edu_2022 @Dubois_2020.
#pagebreak()
= System Design <system-design>
The primary product of this thesis is a testbed that can autonomously interact with a smart speaker for a long time. Our system, called #gls("varys"), collects the following data on its interactions with the voice assistant:
- Audio recordings of the query and the response
- A text transcript of the query and the response
- Encrypted network traffic traces from and to the smart speaker
- The durations of different parts of the interaction
== Modularisation
The wide range of data our system needs to collect makes it necessary to keep its complexity manageable. To simplify maintenance and increase extensibility, the functionality of our system is separated into five modules:
/ #gls("varys"): The main executable combining all modules into the final system.
/ varys-analysis: Analysis of the data collected by #gls("varys").
/ varys-audio: Recording audio and the #gls("tts") and #gls("stt") systems.
/ varys-database: Abstraction of the database system where interactions are stored.
/ varys-network: Collection of network traffic, writing and parsing of #gls("pcap") files.
The modules varys-audio, varys-database and varys-network are isolated from the rest of #gls("varys"). If the need arises (e.g. if a different database connector is required or a more performant #gls("tts") system is found), they can be easily swapped out. The dependencies between all modules are shown in @diag-module-dependencies.
#diagram(<diag-module-dependencies>)[Module dependencies.][
#fletcher.diagram(
node-stroke: 1pt,
edge-stroke: 1pt,
node-corner-radius: 4pt,
edge-corner-radius: 4pt,
spacing: 1.5em,
node((0, 0), `varys`, fill: rgb("eee")),
edge_("r"),
edge_("dd", "l", "d"),
edge_("ddd"),
edge_("dd", "r", "d"),
group_node(colour_analysis, (1, 0), "varys-analysis"),
edge_("d", (0.12, 1), (0.12, 2.625)),
edge_("d", (1.2, 1), (1.2, 2.625)),
group_node(colour_audio, (-1, 3), "varys-audio"),
group_node(colour_network, (0, 3), "varys-network"),
group_node(colour_database, (1, 3), "varys-database"),
)
]
The module varys-analysis does not currently analyse the recorded audio and thus does not depend on varys-audio.
In this section, we lay out the design considerations behind the modules that make up #gls("varys"). The colours shown in @diag-module-dependencies indicate which module each piece of functionality belongs to.
== Network <network>
The system runs on a device that serves as a #gls("mitm") between the Internet and an #emph[internal network], for which it acts as a router. The latter is reserved for the devices that are monitored; this includes the voice assistant and any devices required by it. Since the majority of smart speakers do not have any wired connectivity, the #emph[internal network] is wireless.
#diagram(<diag-network>)[Visualisation of the network layout.][
#fletcher.diagram(
node-stroke: 1pt,
edge-stroke: 1pt,
node-corner-radius: 4pt,
edge-corner-radius: 4pt,
node-fill: rgb("eee"),
spacing: 3em,
node((-1, 0), align(center)[`WiFi Bridge` \ `(optional)`#footnote[Only required if the #gls("mitm") device cannot be connected directly while acting as router for the internal network.]]),
node((-2, 0), `Internet`),
group_node(colour_network, (0, 0), `MITM device`),
edge((-1, 0), `eth`),
group_node(colour_network, (1, -0.5), `Smart Speaker`),
group_node(colour_network, (1, 0.5), [`Additional Devices`#footnote[In the case of the HomePod, this is the iPhone required for #gls("personal requests").]]),
node((-2, 1), `Monitoring`),
edge((-2, 0)),
node((-2, -1), [`VA Server`]),
edge((-2, 0)),
node((0.5, -1), `internal network`, fill: none, stroke: none),
node((-1, -0.6), `external network`, fill: none, stroke: none),
edge((0.4, -0.1), (0.4, 0.1), bend: 45deg),
edge((0.45, -0.2), (0.45, 0.2), bend: 45deg),
edge((0.5, -0.3), (0.5, 0.3), bend: 45deg),
edge((-1.4, -0.1), (-1.4, 0.1), bend: -45deg),
edge((-1.45, -0.2), (-1.45, 0.2), bend: -45deg),
edge((-1.5, -0.3), (-1.5, 0.3), bend: -45deg),
render: (grid, nodes, edges, options) => {
let external = (nodes.at(0), nodes.at(1), nodes.at(2))
let internal = (nodes.at(2), nodes.at(3), nodes.at(4))
cetz.canvas({
enclose_nodes(external, rgb(0, 0, 0, 50%), clearance: (25pt, 28pt, 25pt, 0pt))
enclose_nodes(internal, colour_network_transparent, clearance: (20pt, 55pt, 20pt, 45pt))
fletcher.draw-diagram(grid, nodes, edges, options)
})
}
)
]
This layout, visualised in @diag-network, enables the #gls("mitm") device to record all incoming and outgoing traffic to and from the voice assistant without any interference from other devices on the network. (Since we filter the network traffic by the MAC address of the smart speaker, this isolation is not strictly necessary, but it helps to reduce the amount of garbage data collected.)
Currently, we assume that the beginning and end of an interaction with the voice assistant are known and that we can collect the network capture from when the user started speaking to when the voice assistant stopped speaking. We discuss this restriction in more detail in @traffic-trace-bounds.
The varys-network module provides functionality to start and stop capturing traffic on a specific network interface and write it to a #gls("pcap") file. Furthermore, it offers a wrapper structure around the network packets read from those files, which is used by varys-analysis. Each smart speaker has a unique MAC address, which is used to distinguish incoming from outgoing traffic. The MAC address is therefore also stored in the database alongside the path to the #gls("pcap") file.
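To illustrate how the stored MAC address separates the two directions, the following Python sketch (using scapy; an illustration rather than the actual implementation in varys-network) converts a capture into a list of packet sizes and directions. The MAC address is a placeholder and the 0/1 convention is an arbitrary choice.
```python
# Illustrative sketch: turn a capture into (size, direction) pairs using the
# smart speaker's MAC address. The address is a placeholder and the 0/1
# convention (0 = sent by the speaker, 1 = received) is an arbitrary choice.
from scapy.all import rdpcap
from scapy.layers.l2 import Ether

SPEAKER_MAC = "aa:bb:cc:dd:ee:ff"  # placeholder, stored per interaction in the database

def packet_trace(pcap_path: str) -> list[tuple[int, int]]:
    trace = []
    for packet in rdpcap(pcap_path):
        if not packet.haslayer(Ether):
            continue
        direction = 0 if packet[Ether].src.lower() == SPEAKER_MAC else 1
        trace.append((len(packet), direction))
    return trace
```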
=== Monitoring <monitoring>
Since the system runs for long periods of time without supervision, we need to detect potential outages. We added support to #gls("varys") for #emph[passive] monitoring: a high-uptime server receives a request from the system each time it completes an interaction. That server in turn notifies us if it has not received anything for a configurable amount of time.
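The server-side logic can be as simple as the following Python sketch; the timeout value and the notification mechanism are placeholder assumptions rather than the setup we actually used.
```python
# Illustrative sketch of the passive monitoring watchdog; the timeout and the
# notification mechanism are placeholder assumptions.
import threading
import time

HEARTBEAT_TIMEOUT = 30 * 60  # seconds without a completed interaction before alerting
last_heartbeat = time.monotonic()

def on_interaction_completed() -> None:
    """Called whenever the testbed reports a completed interaction (e.g. via HTTP)."""
    global last_heartbeat
    last_heartbeat = time.monotonic()

def watchdog(notify) -> None:
    """Periodically check whether the testbed has gone silent."""
    while True:
        if time.monotonic() - last_heartbeat > HEARTBEAT_TIMEOUT:
            notify(f"no interaction completed in the last {HEARTBEAT_TIMEOUT // 60} minutes")
        time.sleep(60)

threading.Thread(target=watchdog, args=(print,), daemon=True).start()
```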
== Audio
The largest module of our system is varys-audio. Its main components are the #gls("tts") and #gls("stt") systems, expanded upon in the next two subsections. Apart from that, it stores the recorded audio files encoded in the space-efficient OPUS @rfc6716 format and handles all audio processing: converting recordings to mono, downsampling them for space efficiency, and trimming the silence from the beginning and end.
=== Text-to-Speech <text-to-speech>
The only interface a voice assistant provides is speech. Therefore, we need a system that can talk autonomously, for which there are two options:
- Prerecorded audio
- Speech synthesis
The former was dismissed early for several reasons. Firstly, audio recordings would not support conversations that last for more than one query and one response. Secondly, each change in our dataset would necessitate recording new queries. Lastly, given the scope of this thesis, recording and editing more than 100 sentences was not feasible.
Thus, we decided to integrate a #gls("tts") system into our varys-audio module. This system must provide high quality synthesis to minimise speech recognition mistakes on the smart speaker, and be able to generate speech without any noticeable performance overhead. The latter is typically measured by the #gls("rtf"), with $#gls("rtf") > 1$ if the system takes longer to synthesise text than to speak that text and $#gls("rtf") <= 1$ otherwise. A good #gls("rtf", display: "real-time factor") is especially important if our system is to support conversations in the future; the voice assistant only listens for a certain amount of time before cancelling an interaction.
We explored three different options with $#gls("rtf") <= 1$ on the machine we tested on:
- espeak#footnote(link("https://espeak.sourceforge.net"))
- Larynx#footnote(link("https://github.com/rhasspy/larynx"))
- `tts-rs` using the `AVFoundation` backend#footnote[#link("https://github.com/ndarilek/tts-rs"). The `AVFoundation` backend is only supported on macOS.]
We primarily based our choice on how well Amazon Alexa#footnote[We picked Alexa (over Siri) for these tests since its speech recognition performed worse in virtually every case.] was able to understand different sentences. The test was performed manually by asking each of three questions 20 times with every #gls("tts") system. As a ground truth, the same questions were also asked with a human voice. The results, found in @fig-tts-accuracy, show that `tts-rs` with the `AVFoundation` backend performed approximately $20%$ better on average than both Larynx and espeak and came within $15%$ of the ground truth.
#figure(caption: [Recognition accuracy of sentences synthesised by different #gls("tts") systems and one human voice.])[
#cetz.canvas({
import cetz.draw: *
import cetz.chart
let data = (
(quote[Who am I?], 90, 100, 75, 90),
(align(center, quote[How cold \ is it?]), 100, 75, 65, 95),
(align(center, quote[How old is \ Ian McKellen?]), 100, 75, 45, 15),
([Average], 96.66666667, 83.33333333, 61.66666667, 66.66666667),
)
set-style(legend: (orientation: ltr, padding: 5pt, item: (spacing: 10pt)))
chart.columnchart(
size: (11, 4),
mode: "clustered",
value-key: (1, 2, 3, 4),
labels: ("spoken", `AVFoundation`, "Larynx", "espeak"),
legend: "legend.north",
y-format: tick => str(tick) + "%",
y-tick-step: 25,
y-label: "Accuracy",
bar-style: idx => (fill: (colour_audio.lighten(80%), colour_audio.lighten(50%), colour_audio, colour_audio.darken(20%)).at(idx)),
data
)
})
] <fig-tts-accuracy>
While performing this test, we could easily hear the difference in quality, as only the `AVFoundation` voices come close to sounding like a real voice. These were, apart from the ground truth, also the only voices that the voice assistants could differentiate from each other. @fig-tts-voice-recognition shows how well Alexa was able to distinguish between two voices registered with voice recognition and one unregistered "guest" voice. The recognition of registered voices worked well, while the guest voice was wrongly recognised as one of the registered voices $65%$ of the time.
#figure(caption: [Recognition accuracy of different voices synthesised by `AVFoundation`.])[
#cetz.canvas({
import cetz.draw: *
import cetz.chart
let data = (
([Voice 1], 100),
([Voice 2], 90),
([Guest], 35),
)
set-style(legend: (orientation: ltr, padding: 5pt, item: (spacing: 10pt)))
chart.columnchart(
size: (6, 4),
value-key: (1),
legend: "legend.north",
y-format: tick => str(tick) + "%",
y-tick-step: 25,
y-label: "Accuracy",
bar-style: idx => (fill: (colour_audio.lighten(60%), colour_audio, colour_audio.darken(20%)).at(idx)),
data
)
})
] <fig-tts-voice-recognition>
With these results, we decided to use `tts-rs` on a macOS machine to get access to the `AVFoundation` backend.
During the development of our system, Larynx was succeeded by Piper#footnote(link("https://github.com/rhasspy/piper")). The new system promises good results and may be an option for running #gls("varys") on non-macOS systems in the future.
=== Speech-to-Text <speech-to-text>
Not all voice assistants provide a history of interactions, which is why we use a #gls("stt") system to transcribe the recorded answers.
In our module varys-audio, we utilise an #gls("ml")-based speech recognition system called #gls("whisper"). It provides transcription error rates comparable to those of human transcription @Radford_2022aa and outperforms other local speech recognition models @Seagraves_2022 and commercial services @Radford_2022aa. In @transcription-accuracy, we explore how well #gls("whisper") worked for our use case.
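As an illustration (independent of how #gls("whisper") is integrated in varys-audio), transcribing a recorded response with the reference `openai-whisper` Python package looks roughly as follows; the audio file name is a placeholder and the model size matches the choice discussed in @performance.
```python
# Illustrative sketch using the reference openai-whisper package;
# the audio file name is a placeholder.
import whisper

model = whisper.load_model("medium")  # the model size we settled on
result = model.transcribe("response.opus", language="en")
print(result["text"].strip())
```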
== Data Storage
The module varys-database acts as an abstraction of our database, where all metadata about interactions are stored. Audio files and #gls("pcap") traces are saved to disk by varys-audio and varys-network respectively and their paths are referenced in the database.
Since some changes were made to the system while it was running, we store the version of #gls("varys") a session was run with. This way, if there are any uncertainties about the collected data, the relevant code history and database migrations can be inspected.
The final schema of the database is shown in @tab-schemas. To keep references valid if the project is moved, all paths are stored relative to a main `data` directory.
#figure(
caption: [Schema of the `interaction`, `session` and `interactor_config` tables.],
text(size: 0.8em, block(width: 120%, grid(
columns: (1fr, 1fr),
gutter: 25pt,
table(
inset: 4pt,
stroke: 0.5pt,
columns: 2,
`id`, [Sequential id of the *interaction*#footnote[Stays sequential across sessions for easier identification.]],
`session_id`, "Foreign key of the session",
`query`, "What was spoken",
`query_duration`, "Speaking duration in ms",
`response`, "Transcription of the response",
`response_duration`, "Response duration in ms",
`query_file`, "Path of the query audio file",
`response_file`, "Path of the response audio file",
`capture_file`, "Path of the network capture file",
`started`, "Timestamp when the interaction began",
`ended`, [Timestamp when the interaction ended#footnote[This includes the time taken to transcribe the response audio.]],
`query_category`, "The category of the query",
`assistant_mac`, "The MAC address of the smart speaker",
),
grid(
columns: (1fr),
gutter: 26.5pt,
table(
inset: 4pt,
stroke: 0.5pt,
columns: 2,
`id`, [Sequential id of the *session*#h(3.8em)],
`version`, [The version of #gls("varys")],
`interactor_config_id`, "Foreign key of the configuration",
`data_dir`, "Path to where this session is stored",
`started`, "Timestamp when the session began",
`ended`, "Timestamp when the session ended",
),
table(
inset: 4pt,
stroke: 0.5pt,
columns: 2,
`id`, [Sequential id of the *configuration*],
`interface`, "The network interface where traffic is collected",
`voice`, "The voice used for speaking",
`sensitivity`, "The amplitude of audio considered silence",
`model`, [The #gls("whisper") model used for transcription],
),
)
)))
) <tab-schemas>
== Interaction Process
The main executable of #gls("varys") combines all modules introduced in this section to run a series of interactions. When the system is run, it loads the queries from a TOML file formatted like the example in @lst-queries. Until it is stopped, the system then runs interaction sessions, each with a shuffled version of the original query list.
#figure(caption: [Example of a very small dataset of four queries grouped in three categories.])[```toml
conversion = [
"What are 130 miles in yards?",
"Whats 2330 dollars in euros?",
]
mathematics = [
"Whats 9 plus 53?",
]
home = [
"Open the garage",
]
```] <lst-queries>
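To make the session loop concrete, the following Python sketch (an illustration, not our actual implementation) loads such a query file and shuffles the flattened list of queries once per session; the file name is a placeholder.
```python
# Illustrative sketch: load a query file like the example above and shuffle it
# for one session. The file name is a placeholder.
import random
import tomllib  # Python 3.11+

with open("queries.toml", "rb") as file:
    categories = tomllib.load(file)

# flatten into (category, query) pairs and shuffle once per session
queries = [(category, query) for category, items in categories.items() for query in items]
random.shuffle(queries)
for category, query in queries:
    print(category, query)
```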
The exact process is shown in @diag-data-collection. Resetting the voice assistant is implemented differently for each one supported. As an example, the exact procedure of resetting Siri is shown in @algo-reset-siri.
#align(center, box(width: 75%, [#algorithm(
caption: [Resetting Siri after a recording timeout.],
pseudocode(
[*say* #emph["Hey Siri, stop."]],
[*wait* for 2s of silence],
[*say* #emph["Hey Siri, turn off the music."]],
[*wait* for 2s of silence],
[*say* #emph["Hey Siri, disable all alarms."]],
[*wait* for 2s of silence],
),
) <algo-reset-siri>]))
#diagram(<diag-data-collection>)[Data collection flow chart.][
#fletcher.diagram(
node-stroke: 1pt,
edge-stroke: 1pt,
node-corner-radius: 4pt,
edge-corner-radius: 4pt,
node-fill: rgb("eee"),
spacing: 1.5em,
edge((0, -1), (0, 0), "@-"),
group_node(colour_database, (0, 0), `new session with input query list`),
edge_(),
node((0, 1), `choose next voice from ring buffer`),
edge_(),
node((0, 2), `shuffle queries`),
edge_(),
node((0, 3), `take next query`),
edge_("r", "uuu", "l", `no queries left`),
edge_(),
group_node(colour_network, (0, 4), `begin network capture`),
group_edge(colour_network, `network capture`, "rr", "dddddddd", "ll"),
edge_(),
group_node(colour_audio, (0, 5), `begin recording query audio`),
group_edge(colour_audio, `query audio`, "r", "dd", "l"),
edge_(),
node((0, 6), `speak query`),
edge_(),
group_node(colour_audio, (0, 7), `end audio recording`),
edge_(),
group_node(colour_audio, (0, 8), `begin recording response audio`),
group_edge(colour_audio, `response audio`, "r", "ddd", "l"),
edge_(),
node((0, 9), `wait for silence`),
edge_((0, 11), `silence detected`),
edge_("ll", `timeout`),
group_node(colour_audio, (-2, 9), `cancel recording`),
edge_(),
group_node(colour_network, (-2, 8), `cancel capture`),
edge_(),
node((-2, 7), `reset assistant`),
edge("uuuu", "rr"),
group_node(colour_audio, (0, 11), `end audio recording`),
edge_(),
group_node(colour_network, (0, 12), `end network capture`),
edge_(),
group_node(colour_audio, (0, 14), `transcription in progress?`),
group_edge(colour_audio, `transcription`, (0.4, 13.6), (0.4, 12.5), (2, 12.5), (2, 16.3), (0.4, 16.3), (0.4, 16.25)),
edge_((0, 16), `no`, label-side: left),
edge_((1, 14), `yes`),
group_node(colour_audio, (1, 14), `transcription done?`),
edge_((1, 14), (1, 14), [`no`#footnote[Sleep for a short amount of time before checking again.]], label-side: center, bend: 115deg),
edge_((1, 17), `yes`, label-side: left),
group_node(colour_database, (1, 17), [`complete interaction`#footnote[The previous interaction whose transcription was just completed.]]),
edge_("l", "u"),
group_node(colour_audio, (0, 16), `begin next transcription`),
edge_("l", "uuuuuuuuuuuuu", "r"),
node((2.5, 17), text(colour_database)[`database`], fill: none, stroke: none),
group_edge(colour_database, ``, (1, 17)),
)
]
== Performance Considerations <performance>
The initial design of our system intended for it to run on a Raspberry Pi, a small-form-factor computer with limited processing and storage capabilities. Mainly because we did not find a #gls("tts") system that ran with satisfactory performance on the Raspberry Pi, we now use a laptop computer instead.
=== Data Compression
Because of the initial plan to use a Raspberry Pi, we compress the recorded audio with the OPUS @rfc6716 codec and the captured network traffic with DEFLATE @rfc1951. However, since we now run the experiment on a laptop, space is no longer a concern. The audio codec adds no noticeable processing overhead, but the #gls("pcap") files are now stored uncompressed, which also simplifies our data analysis since the traces can be used without any preprocessing.
=== Audio Transcription
Apart from the time spent speaking (as mentioned in @overview), the transcription of audio is the main bottleneck for how frequently our system can run interactions. The following optimisations allowed us to reduce the average time per interaction from $~120$s to $~30$s:
==== Model Choice
The #gls("whisper") model is available in five sizes (with two versions of the largest), shown in @tab-whisper-sizes. The step from the `medium` to the `large` and `large-v2` models does not significantly decrease the #gls("wer") but more than doubles the number of parameters. Since, in our tests, this increase in network size far more than doubled processing time, we opted for the `medium` model.
#figure(caption: [Comparison of #gls("whisper") models by #gls("wer") on English datasets @Radford_2022aa.])[
#let tiny = (39, calc.round((15.7, 28.8, 11.6, 12.4).sum() / 4, digits: 2))
#let base = (74, calc.round((11.7, 21.9, 9.5, 8.9).sum() / 4, digits: 2))
#let small = (244, calc.round((8.3, 14.5, 8.2, 6.1).sum() / 4, digits: 2))
#let medium = (769, calc.round((6.8, 11.2, 7.6, 4.4).sum() / 4, digits: 2))
#let large = (1550, calc.round((6.3, 10.1, 7.2, 4.5).sum() / 4, digits: 2))
#let large-v2 = (1550, calc.round((6.2, 9.4, 7.0, 4.2).sum() / 4, digits: 2))
#grid(
columns: (1fr, 1fr),
table(
columns: 3,
align: (left, right, right),
[*Model*], [*Parameters*], [*Word Error Rate*],
`tiny`, [#tiny.at(0)M], [#tiny.at(1)%],
`base`, [#base.at(0)M], [#base.at(1)%],
`small`, [#small.at(0)M], [#small.at(1)%],
`medium`, [#medium.at(0)M], [#medium.at(1)%],
`large`, [#large.at(0)M], [#large.at(1)%],
`large-v2`, [#large-v2.at(0)M], [#large-v2.at(1)%],
),
cetz.canvas({
import cetz.draw: *
import cetz.plot
plot.plot(
size: (6, 4),
axis-style: "scientific-auto",
x-label: "Parameters",
x-format: tick => if tick > 0 { str(tick) + "M" } else { str(tick) },
x-tick-step: 500,
x-min: 0,
y-label: [#gls("wer")],
y-format: tick => str(tick) + "%",
y-tick-step: 5,
y-min: 0,
y-max: 20,
{
plot.add(
(tiny, base, small, medium, large-v2),
style: (stroke: 1pt + colour_audio),
mark: "o",
mark-style: (stroke: colour_audio, fill: colour_audio.lighten(80%)),
)
})
})
)
] <tab-whisper-sizes>
==== Transcription in Parallel
The bottleneck of waiting for #gls("whisper") to transcribe the response could be eliminated completely with a transcription queue that is processed in parallel to the main system. However, if the system runs for weeks at a time, this can lead to arbitrarily long post-processing times if a transcription takes longer on average than an interaction.
Our solution is to process the previous response while the next interaction is already running, but to hold off on further interactions until the transcription queue (of length 1) is empty again. This approach reduces the time per interaction to $max(T_t, T_i)$, rather than $T_t + T_i$, where $T_t$ represents the time taken for transcription and $T_i$ the time taken for the rest of the interaction.
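This scheduling can be sketched in Python as follows; `run_interaction` and `transcribe` stand in for the real functionality, and the sketch is an illustration rather than our actual implementation.
```python
# Illustrative sketch of the length-one transcription queue: the previous
# response is transcribed while the next interaction runs, so one iteration
# takes max(T_t, T_i) instead of T_t + T_i.
from threading import Thread

def interaction_loop(queries, run_interaction, transcribe):
    pending = None  # at most one transcription in flight
    for query in queries:
        response_audio = run_interaction(query)  # takes T_i
        if pending is not None:
            pending.join()  # wait until the previous transcription is done
        pending = Thread(target=transcribe, args=(response_audio,))  # takes T_t
        pending.start()
    if pending is not None:
        pending.join()
```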
==== Audio Preprocessing
Another step we took to reduce the transcription time is to downsample the audio to $16$ kHz, average both channels into one and trim the silence from the start and the end. This preprocessing does not noticeably reduce the quality of the transcription. Moreover, cutting off silent audio eliminates virtually all #gls("whisper") hallucinations (see @transcription-accuracy).
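A simplified Python sketch of this preprocessing is shown below; the silence threshold and the naive decimation are assumptions, and the actual implementation in varys-audio may differ.
```python
# Illustrative sketch of the audio preprocessing: downmix to mono, downsample
# to 16 kHz and trim leading/trailing silence. The threshold and the naive
# decimation (no low-pass filtering) are simplifying assumptions.
import numpy as np

def preprocess(samples: np.ndarray, rate: int, threshold: float = 0.01) -> np.ndarray:
    # average both channels into one
    if samples.ndim == 2:
        samples = samples.mean(axis=1)
    # naive downsampling to 16 kHz (a real resampler would low-pass filter first)
    step = rate // 16_000
    if step > 1:
        samples = samples[::step]
    # trim silence from the start and the end
    loud = np.flatnonzero(np.abs(samples) > threshold)
    if loud.size == 0:
        return samples[:0]
    return samples[loud[0]:loud[-1] + 1]
```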
#pagebreak()
= Experiment <experiment>
To test our data collection system, we conducted several experiments where we let #gls("varys") interact with an Apple HomePod mini.
Three different lists of queries were tested:
- A large dataset with $227$ queries, split into $24$ categories
- A small dataset with $13$ queries, each from a different category
- A binary dataset with the two queries "Call John Doe" and "Call Mary Poppins"
Apple does not release an official list of Siri's capabilities. A dataset of $~800$ Siri commands was originally compiled from a user-collected list @Hey_Siri_Commands and extended with queries from our own exploration. The HomePod does not support all Siri queries#footnote[Any queries that require Siri to show something to the user will be answered with something along the lines of #emph(quote[I can show you the results on your iPhone.])]. Therefore, during an initial test period, we manually removed commands from the list that did not result in a useful response. We further reduced the number of redundant commands to increase the number of samples that could be collected per query. Having fewer than 256 labels has the additional advantage that each query label fits in one byte.
The full lists of queries are shown in Tables @tab-dataset-large[], @tab-dataset-small[] and @tab-dataset-binary[]. Further analysis of our results can be found in @results. In this section we detail the experiment setup and our traffic fingerprinting system.
== Setup
The experiment setup consists of a laptop running macOS#footnote[The choice of using macOS voices is explained in @text-to-speech.], connected to a pair of speakers and a microphone. The latter is placed, together with the HomePod, inside a sound-isolated box (shown in @sound-isolation). Additionally, the user's iPhone is required to be on the #emph[internal network] (see @network) for Siri to be able to respond to #gls("personal requests").
The iPhone is set up with a dummy user account for #emph[Peter Pan] and two contacts, #emph[John Doe] and #emph[Mary Poppins]. Otherwise, the phone is kept at factory settings and the only changes come from interactions with the HomePod.
Finally, to minimise system downtime, an external server is used for monitoring as explained in @monitoring.
#figure(
caption: [Sound-isolation to improve transcription accuracy.],
placement: auto,
image("images/sound-isolation.jpeg", width: 60%)
) <sound-isolation>
The experiment must be set up in a location with the following characteristics:
#block(breakable: false)[
- Wired Internet connection#footnote[A wireless connection is not possible, since macOS does not support connecting to WiFi and creating a wireless network at the same time. If no wired connection is available, a network bridge (e.g. a Raspberry Pi bridging WiFi to its Ethernet port) can be used to connect the Mac to the internet.]
- Power outlet available
- No people speaking nearby (the quieter the environment, the higher the voice recognition accuracy)
- Noise from the experiment does not disturb anyone
- Access to the room during working hours (or, if possible, at any time)
]
The sound isolation of the box can reduce ambient noise in the range of $50 - 60$ dB to a significantly lower $35 - 45$ dB, which increases the voice recognition and transcription accuracy. This also helps with finding an experiment location, since the noise disturbance is less significant.
When the hardware is set up, the command `varys listen --calibrate` is used to find the ambient noise volume.
== Data Analysis
We began our data analysis by building a simple tool that takes a number of traffic traces and visualises them as shown in @fig-example-trace. Red and blue represent incoming and outgoing packets respectively, and the strength of the colour indicates the packet size. The $x$-axis does not necessarily correspond to time, but to the number of packets since the beginning of the trace. This method of analysing traffic traces aligns with previous work on traffic fingerprinting @Wang_2020aa.
#figure(
caption: [Example traffic trace visualisation of the query #emph(quote[What's the temperature outside?]).],
image(width: 40%, "images/plots/plot-Hey Siri. Whats the temperature outside.png"),
) <fig-example-trace>
The example in @fig-example-trace already shows a clear correlation between the traces (excluding some outliers where the HomePod was likely downloading updates or doing other unrelated tasks). To test whether we can differentiate between multiple queries, we trained several #gls("ml") classifiers on our data.
=== Traffic Preprocessing
We write a network traffic trace as $((s_1, d_1), (s_2, d_2), ..., (s_n, d_n))$, where $n$ is the number of packets in that trace, $s_i$ is the size of a packet in bytes and $d_i in {0, 1}$ is the direction of that packet.
To be used as input to our #gls("ml") network, the traffic traces were preprocessed into normalised tensors of length $475$. Each collected network traffic trace is converted as
$$$
((s_1, d_1), (s_2, d_2), ..., (s_n, d_n)) -> (p_1, p_2, ..., p_475),
$$$
where for all $i in [1, 475]$ the components are calculated as
$$$
p_i = cases(
(-1)^(d_i) dot s_i/1514 #h(0.8em) "if" i <= n",",
0 #h(5.5em) "else".
)
$$$
This pads traces shorter than $475$ packets with zeroes, truncates longer ones and normalises all values into the range $[-1, 1]$#footnote[The maximum size of the packets is $1514$ bytes, which is why we can simply divide by this value to normalise our data.].
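In Python, this conversion can be sketched directly from the formula above (assuming the `(size, direction)` pairs have already been extracted from the capture):
```python
# Illustrative sketch of the preprocessing described above: pad or truncate to
# 475 packets and normalise the signed sizes into [-1, 1].
import numpy as np

TRACE_LENGTH = 475
MAX_PACKET_SIZE = 1514  # maximum packet size in bytes

def to_tensor(trace: list[tuple[int, int]]) -> np.ndarray:
    """trace is a list of (size, direction) pairs with direction in {0, 1}."""
    tensor = np.zeros(TRACE_LENGTH, dtype=np.float32)
    for i, (size, direction) in enumerate(trace[:TRACE_LENGTH]):
        tensor[i] = (-1) ** direction * size / MAX_PACKET_SIZE
    return tensor
```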
=== Adapting Network from Wang et al. <adapting-network>
Since there is similar previous work on traffic fingerprinting Amazon Alexa and Google Assistant interactions by #pcite(<Wang_2020aa>), we based our network on their #gls("cnn"), shown in @diag-cnn-wang.
#diagram(<diag-cnn-wang>)[The #gls("cnn") design used in @Wang_2020aa.][
#fletcher.diagram(
node-stroke: 1pt,
node-fill: rgb("eee"),
edge-stroke: 1pt,
node-corner-radius: 4pt,
edge-corner-radius: 4pt,
spacing: 1.5em,
group_node(colour_highlight, (0, 0), align(center)[Input]),
edge_(),
node((0, 1), align(center)[Conv. Layer \ `[tanh]`]),
edge_(),
node((1, 1), align(center)[Max Pooling \ [$1$]]),
edge_(),
node((2, 1), align(center)[Dropout \ [$0.1$]]),
edge_(),
node((0, 3), align(center)[Conv. Layer \ `[elu]`]),
edge_(),
node((1, 3), align(center)[Max Pooling \ [$1$]]),
edge_(),
node((2, 3), align(center)[Dropout \ [$0.3$]]),
edge_(),
node((0, 5), align(center)[Conv. Layer \ `[elu]`]),
edge_(),
node((1, 5), align(center)[Max Pooling \ [$1$]]),
edge_(),
node((2, 5), align(center)[Dropout \ [$0.1$]]),
edge_(),
node((0, 7), align(center)[Conv. Layer \ `[selu]`]),
edge_(),
node((1, 7), align(center)[Max Pooling \ [$1$]]),
edge_(),
node((2, 7), align(center)[Global Average \ Pooling]),
edge_(),
node((0, 9), align(center)[Dense Layer \ `[selu]`]),
edge_(),
node((1, 9), align(center)[Dense Layer \ `[softmax]`]),
edge_(),
group_node(colour_highlight, (2, 9), align(center)[Output]),
render: (grid, nodes, edges, options) => {
let cnn_1 = (nodes.at(1), nodes.at(2), nodes.at(3))
let cnn_2 = (nodes.at(4), nodes.at(5), nodes.at(6))
let cnn_3 = (nodes.at(7), nodes.at(8), nodes.at(9))
let cnn_4 = (nodes.at(10), nodes.at(11), nodes.at(12))
cetz.canvas({
enclose_nodes(cnn_1, rgb(0, 0, 0, 50%), clearance: (24pt, 35pt, 24pt, 38pt))
enclose_nodes(cnn_2, rgb(0, 0, 0, 50%), clearance: (24pt, 35pt, 24pt, 38pt))
enclose_nodes(cnn_3, rgb(0, 0, 0, 50%), clearance: (24pt, 35pt, 24pt, 38pt))
enclose_nodes(cnn_4, rgb(0, 0, 0, 50%), clearance: (24pt, 44pt, 24pt, 41pt))
fletcher.draw-diagram(grid, nodes, edges, options)
})
}
)
]
In addition to a #gls("cnn"), they also tested an #gls("lstm"), a #gls("sae") and a combination of the networks in the form of ensemble learning. However, their best results for a single network came from their #gls("cnn"), so we decided to use that.
Owing to limited computing resources, we reduced the size of our network to the one shown in @diag-cnn. The network of #cite(<Wang_2020aa>, form: "author") used a pool size of $1$, which is equivalent to a #emph[no-op]. Thus, we removed all #emph[Max Pooling] layers. Additionally, we reduced the four groups of convolutional layers to one.
#diagram(<diag-cnn>)[The #gls("cnn") implemented by us.][
#fletcher.diagram(
node-stroke: 1pt,
node-fill: rgb("eee"),
edge-stroke: 1pt,
node-corner-radius: 4pt,
edge-corner-radius: 4pt,
spacing: 1.5em,
group_node(colour_highlight, (0, 0), align(center)[Input]),
edge_(),
node((1, 0), align(center)[Conv. Layer \ `[tanh]`]),
edge_(),
node((2, 0), align(center)[Dropout \ $0.1$]),
edge_(),
node((3, 0), align(center)[Global Average \ Pooling]),
edge_(),
node((4, 0), align(center)[Dense Layer \ `[elu]`]),
edge_(),
node((5, 0), align(center)[Dense Layer \ `[softmax]`]),
edge_(),
group_node(colour_highlight, (6, 0), align(center)[Output]),
render: (grid, nodes, edges, options) => {
let cnn_1 = (nodes.at(1), nodes.at(2), nodes.at(3))
cetz.canvas({
enclose_nodes(cnn_1, rgb(0, 0, 0, 50%), clearance: (24pt, 44pt, 24pt, 41pt))
fletcher.draw-diagram(grid, nodes, edges, options)
})
}
)
]
Since the hyperparameter search performed by #cite(<Wang_2020aa>, form: "author") found $180$ to be the optimal dense layer size when that was the maximum of their search space ${100, 110, ..., 170, 180}$, we can assume that the true optimal size is higher than $180$. In our case, we used $475$, the size of our input.
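For illustration, the reduced architecture can be written as the following Keras sketch; the filter count, kernel size, optimiser and loss are assumptions, since only the layer types, activations, input length and dense layer size are fixed by the description above.
```python
# Illustrative Keras sketch of the reduced CNN described above.
# Filter count, kernel size, optimiser and loss are assumptions.
from tensorflow import keras

def build_model(num_labels: int) -> keras.Model:
    return keras.Sequential([
        keras.layers.Input(shape=(475, 1)),             # one normalised trace per sample
        keras.layers.Conv1D(32, 5, activation="tanh"),  # assumed filter count and kernel size
        keras.layers.Dropout(0.1),
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(475, activation="elu"),      # dense size equal to the input length
        keras.layers.Dense(num_labels, activation="softmax"),
    ])

model = build_model(13)  # e.g. the small dataset with 13 queries
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
```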
To split our dataset into separate training, validation and testing parts, we used the same proportions as #pcite(<Wang_2020aa>)\; of the full datasets, $64%$ were used for training, $16%$ for validation, and $20%$ for testing.
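These proportions correspond to two consecutive 80/20 splits, as in the following Python sketch (scikit-learn and the stratification by label are used purely for illustration): 20% is held out for testing, and 20% of the remaining 80% (16% of the whole) is used for validation.
```python
# Illustrative sketch of the 64%/16%/20% train/validation/test split.
# Stratification by label is an added assumption.
from sklearn.model_selection import train_test_split

def split(tensors, labels, seed: int = 0):
    # hold out 20% for testing ...
    x_rest, x_test, y_rest, y_test = train_test_split(
        tensors, labels, test_size=0.20, random_state=seed, stratify=labels
    )
    # ... then 20% of the remaining 80% (16% overall) for validation
    x_train, x_val, y_train, y_val = train_test_split(
        x_rest, y_rest, test_size=0.20, random_state=seed, stratify=y_rest
    )
    return (x_train, y_train), (x_val, y_val), (x_test, y_test)
```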
#pagebreak()
= Results <results>
Our system ran for approximately $800$ hours and collected data on a total of $73'605$ interactions. The data collected for each dataset can be retrieved with SQL listings @sql-large-dataset[], @sql-small-dataset[] and @sql-binary-dataset[]. The collected data is available at #link("https://gitlab.com/m-vz/varys-data").
In this section we will go over our results, beginning with the statistics of our collected dataset.
== Statistics <statistics>
Of the $73'605$ queries collected, $71'915$ were successfully completed (meaning there was no timeout and the system was not stopped during the interaction). Our system #gls("varys") is therefore able to interact with Siri with a $97.70%$ success rate#footnote[See @sql-success-rate.].
The average duration of interactions is shown in @fig-average-duration (data from SQL listings @sql-average-duration[] and @sql-average-remaining-duration[]). As mentioned in @overview, there are approximately $4$ to $8$ seconds of speaking during most interactions. The remaining duration (measured as the total duration minus the speaking duration) lies mostly between $25$ and $30$ seconds. This includes the time spent waiting for the voice assistant to answer and transcribing the response. It did not increase even for outliers with a significantly longer speaking duration.
#figure(caption: [Distribution of average query duration. Outliers above 20s are shown in @tab-average-duration-outliers.])[
#cetz.canvas({
import cetz.draw: *
import cetz.plot
let data = csv("csv/aggregate_average_duration.csv").map(item => {
(float(item.at(0)), int(item.at(1)))
})
let data_remaining = csv("csv/aggregate_average_remaining_duration.csv").map(item => {
(float(item.at(0)), int(item.at(1)))
})
set-style(legend: (padding: 5pt, item: (spacing: 10pt)))
plot.plot(
size: (14, 5),
axis-style: "scientific-auto",
legend: "legend.inner-north",
x-label: "average duration [s]",
x-tick-step: 2,
x-min: 0,
x-max: 36,
y-label: "queries",
y-tick-step: 1,
y-min: 0,
{
for item in data {
plot.add(
((item.at(0), 0), (..item)),
style: (stroke: 1.5pt + colour_analysis),
)
}
plot.add(
((100, 0), (100, 1)),
style: (stroke: 8pt + colour_analysis),
label: "Avg. Speaking Duration",
)
for item in data_remaining {
plot.add(
((item.at(0), 0), (..item)),
style: (stroke: 1.5pt + colour_highlight),
)
}
plot.add(
((100, 0), (100, 1)),
style: (stroke: 8pt + colour_highlight),
label: "Avg. Remaining Duration",
)
})
})
] <fig-average-duration>
#figure(caption: [Outlier queries with a speaking duration above 20s.])[
#table(
columns: 3,
align: (left, right, right),
[*Query*], [*Avg. Speaking \ Duration*], [*Avg. Remaining \ Duration*],
"Hey Siri. Tell me a bedtime story", [$79.53$s], [$31.06$s],
"Hey Siri. Tell me a story", [$68.86$s], [$35.73$s],
"Hey Siri. Tell me a poem", [$28.92$s], [$21.31$s],
)
] <tab-average-duration-outliers>
On average, a query took $2.47$ seconds ($±0.64$) to say, which is expected since all query sentences are of similar length. The responses lasted on average $4.40$ seconds with a standard deviation of $5.63$s.
In @fig-uptime we show the uptime of our system in hours from when it was set up. During the first week, there were two bugs that resulted in several downtimes (the longest of which lasted $~42$h because we could not get physical access to the system during the weekend). After that, the system ran flawlessly except during one network outage at the university and during maintenance.
#figure(caption: [Uptime of our system.])[
#cetz.canvas({
import cetz.draw: *
import cetz.plot
let data = csv("csv/uptime.csv").map(item => {
let date = item.at(0)
let date = datetime(
year: int(date.slice(0, count: 4)),
month: int(date.slice(5, count: 2)),
day: int(date.slice(8, count: 2)),
hour: int(date.slice(11, count: 2)),
minute: int(date.slice(14, count: 2)),
second: int(date.slice(17, count: 2)),
)
(date, int(item.at(1)))
})
let first = data.at(0).at(0)
let data = data.map(item => {
((item.at(0) - first).hours(), item.at(1))
})
plot.plot(
size: (14, 1),
axis-style: "scientific-auto",
plot-style: (stroke: 0pt, fill: colour_analysis),
x-label: "time from experiment start [h]",
x-tick-step: 50,
x-max: data.last().at(0) - 1,
y-label: none,
y-tick-step: none,
{
plot.add(data, line: "hv", hypograph: true)
})
})
] <fig-uptime>
== Transcription Accuracy <transcription-accuracy>
Compared to the results by #pcite(<Radford_2022aa>) and #pcite(<Seagraves_2022>), our own results show an even higher transcription accuracy apart from errors in URLs, certain names and slight formatting differences. This is likely due to the audio coming from a synthetic voice in a controlled environment.
@tab-stt-accuracy shows the manually verified results for seven selected queries (see @sql-different-responses). We filtered out invalid interactions where Siri was waiting for the answer to a follow-up question or could not respond due to a network issue.
#figure(caption: [#gls("whisper") speech recognition accuracy samples.])[#table(
columns: 5,
align: (left, right, right, right, right),
[*Query*], [*Correct*], [*Incomplete*], [*Incorrect*], [*Invalid*],
emph(quote[How far away is Boston?]), pn(83, 134), [#pn(49, 134)#footnote[These were all cases where part of the transcription was missing; see below.]], pn(2, 134), [$6$],
emph(quote[What is the factorial of 6?]), pn(1823, 1840), pn(0, 1840), pn(17, 1840), [$6$],
emph(quote[Define airplane]), pn(136, 144), pn(5, 144), pn(3, 144), [$4$],
emph(quote[Call back my last missed call]), pn(137, 138), pn(1, 138), pn(0, 138), [$8$],
emph(quote[Whats 9 plus 53?]), pn(138, 139), pn(0, 139), pn(1, 139), [$5$],
emph(quote[What's 200 pounds in kilograms?]), pn(142, 143), pn(0, 143), pn(1, 143), [$2$],
emph(quote[Flip a coin]), pn(3, 145), pn(1, 145), pn(141, 145), [$0$],
)] <tab-stt-accuracy>
The query #emph(quote[How far away is Boston?]) was missing #emph(quote[as the crow flies]) from the correct transcription #emph(quote[[...] it's about 5,944 kilometers as the crow flies.]) about $36%$ of the time. Together with the correctly transcribed cases, this makes up $98.51%$ of this query's data. In the response to #emph(quote[What is the factorial of 6?]), Siri provided the source #link("https://solumaths.com", "solumaths.com"), which is text that #gls("whisper") was unable to produce#footnote[In 1823 cases the name #emph[solumaths] was recognised as either #emph[solomus], #emph[solomoths], #emph[solomons], #emph[solomuths], #emph[soloomiths], #emph[salumith] or #emph[solemnus], but never correctly.]. Since mistakes in URLs are expected, we marked those transcriptions as correct.
There are some rare cases where the voice assistant responds with a single word. Since #gls("whisper") does not support recognition of audio shorter than one second, these queries have an exceedingly high failure rate. One example is the query #emph(quote[Flip a coin]), which was only recognised correctly $~2%$ of the time. Out of the $140$ incorrectly transcribed responses, $134$ were cancelled due to the recorded audio being too short.
Another problem we identified with #gls("whisper") is its tendency to hallucinate text when fed silent audio (one or more seconds of `0`-samples). Some examples of text hallucinated from empty audio are shown in @tab-stt-hallucinations. This can be prevented by simply trimming off all silent audio in the preprocessing step before transcription.
#figure(caption: [#gls("whisper") speech recognition hallucinations when transcribing empty audio.])[#table(
columns: 2,
align: (right, left),
[*Silent Audio* [ms]], [*Transcription*],
$3054$, emph[When you find out about any of our videos, please like and subscribe, and we will see you all next time!],
$744$, emph[Please see review No.106243 on PissedConsumer.com],
$2944$, [#emph[Go to Beadaholique.com for all of your beading supply needs!] \ #emph[Thank you for watching.] \ #emph[Thank you for watching.]],
$1376$, [#emph[It's super short.] \ #emph("$1.99.") \ #emph("$1.99.")],
)] <tab-stt-hallucinations>
== Traffic Fingerprinting
With our implementation of a fingerprinting #gls("cnn"), we were able to get promising results. Our network was trained on the binary, small and large datasets for $1000$ epochs each. Detailed training statistics can be found in @fig-test-all. The accuracy axis of the training progress is bounded by the interval $[100 / n, 100]$, where $n$ is the number of labels. The bottom of the accuracy axis is therefore equivalent to random choice. (As an example, the accuracy of the binary classifier starts at $50%$, which corresponds to choosing randomly between two options.) The loss axis is bounded to $[0, 3]$ to fit our data; the loss for training on our large dataset started outside that range at $5.05$.
#figure(
caption: [A selection of training and validation progress charts.],
block(width: 125%, grid(
columns: (1fr, 1fr),
ml-chart(0, 2, [Early test of binary classifier]),
ml-chart(2, 12, [Early test of small dataset]),
ml-chart(6, 2, [Binary dataset]),
ml-chart(5, 13, [Small dataset]),
ml-chart(7, 227, [Large dataset]),
pad(left: 4em, top: 0.5em, block(stroke: 1pt + black, inset: 6pt, align(left)[
#box(width: 15pt, height: 0.5em, fill: colour_analysis) Training Accuracy \
#box(width: 15pt, height: 0.5em, fill: black) Test Accuracy \
#box(width: 15pt, height: 0.5em, fill: colour_highlight) Loss
])),
))
) <fig-test-all>
The trends in these graphs suggest that with more training, the models could likely be improved further. However, since this serves as a proof of concept, and due to limited time and processing capacity, we did not train any network for more than $1000$ epochs.
For binary classification between the queries "Call John Doe" and "Call Mary Poppins", our final accuracy on the test set was $~71.19%$. This result is especially concerning since it suggests this method can differentiate between very similar queries. An attacker could train a network on names of people they want to monitor and tell from the encrypted network traffic whether the voice assistant was likely used to call that person.
@fig-binary-traces shows $500$ traces for each of the two binary classification queries. The traces of these two queries are visually much more similar to each other than to the traces in @fig-all-traces.
The second classification we attempted was between the $13$ different queries from @tab-dataset-small, each from a different category. The result for this dataset is an $86.19%$ accuracy on the test set. Compared to less than $8%$ for a random guess, this confirms our concerns about the privacy and security of the HomePod.
The large dataset with $227$ different queries showed an accuracy of $40.40%$. This result is likely limited by the size of our network and our training capacity. Compared to randomly guessing at $0.44%$, however, the loss of privacy becomes apparent.
With adjustments to our #gls("cnn"), more training data and more hardware resources, we expect that highly accurate classification can be achieved even between hundreds of different queries.
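To illustrate the general shape of such a traffic fingerprinting model, the following is a minimal PyTorch sketch of a one-dimensional #gls("cnn") over fixed-length sequences of signed packet sizes. It is not the architecture evaluated above; all layer sizes and the input representation are illustrative placeholders.

```python
import torch
import torch.nn as nn

class TraceCNN(nn.Module):
    """Toy 1-D CNN over a fixed-length trace of signed packet sizes.

    Input shape: (batch, 1, trace_len), where each value is a packet size,
    negated for incoming packets. All layer sizes are placeholders.
    """

    def __init__(self, num_classes: int):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=8, padding=4), nn.ReLU(),
            nn.MaxPool1d(4),
            nn.Conv1d(32, 64, kernel_size=8, padding=4), nn.ReLU(),
            nn.MaxPool1d(4),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.LazyLinear(128), nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.classifier(self.features(x))

# Example with placeholder sizes: model = TraceCNN(num_classes=13)
```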
#figure(
caption: [Traffic traces for the two queries in the binary dataset. \ Each image contains 500 traces.],
block(width: 100%, grid(
columns: (1fr, 1fr),
gutter: 1em,
align(right, image(width: 80%, "images/plots/plot-Hey Siri. Call John Doe.png")),
align(left, image(width: 80%, "images/plots/plot-Hey Siri. Call Mary Poppins.png")),
))
) <fig-binary-traces>
#figure(
caption: [Traffic traces for nine different queries from the small dataset. \ Each image contains 100 traces.],
grid(
columns: (1fr, 1fr, 1fr),
column-gutter: 1em,
row-gutter: 0.5em,
[
#image(width: 90%, "images/plots/plot-Hey Siri. Any missed calls.png") \
#v(-15pt, weak: true)
#emph(quote("Any missed calls?"))
],
[
#image(width: 90%, "images/plots/plot-Hey Siri. Read Calendar.png") \
#v(-15pt, weak: true)
#emph(quote("Read Calendar"))
],
[
#image(width: 90%, "images/plots/plot-Hey Siri. Remind me to wash the car.png") \
#v(-15pt, weak: true)
#emph(quote("Remind me to wash the car"))
],
[
#image(width: 90%, "images/plots/plot-Hey Siri. How far is New York from Boston.png") \
#v(-15pt, weak: true)
#emph(quote("How far is New York from Boston?"))
],
[
#image(width: 90%, "images/plots/plot-Hey Siri. How old is Ian McKellen.png") \
#v(-15pt, weak: true)
#emph(quote("How old is Ian McKellen?"))
],
[
#image(width: 90%, "images/plots/plot-Hey Siri. Is there a God.png") \
#v(-15pt, weak: true)
#emph(quote("Is there a God?"))
],
[
#image(width: 90%, "images/plots/plot-Hey Siri. Roll a die.png") \
#v(-15pt, weak: true)
#emph(quote("Roll a die"))
],
[
#image(width: 90%, "images/plots/plot-Hey Siri. Translate car from English to Spanish.png") \
#v(-15pt, weak: true)
#emph(quote("Translate car from English to Spanish"))
],
[
#image(width: 90%, "images/plots/plot-Hey Siri. What day was 90 days ago.png") \
#v(-15pt, weak: true)
#emph(quote("What day was 90 days ago?"))
],
)
) <fig-all-traces>
== On-Device Recognition <on-device-recognition>
According to HomePod marketing,
#quote(block: true, attribution: [#link("https://apple.com/homepod-mini")])[HomePod mini works with your iPhone for requests like hearing your messages or notes, so they are completed on device without revealing that information to Apple.]
Since our threat model from @threat-model assumes that we collect data from the network that both the HomePod and the iPhone are on, we still have access to that traffic. The difference in how the traffic of normal queries and of #gls("personal requests") is handled is visible in the collected traces. Looking at the visualisations in @fig-binary-traces and at the traces for #emph(quote[Any missed calls?]), #emph(quote[Read Calendar]) and #emph(quote[Remind me to wash the car]) in @fig-all-traces, a group of large outgoing packets (in dark blue) following the first incoming packets can be seen across the vast majority of traces. This group of packets is missing from the traces of the normal queries in @fig-all-traces. We expect that a binary classifier distinguishing these two types of requests would be trivial to build; due to time constraints, however, we leave this to subsequent research.
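To give an impression of how detectable this feature is, even a simple hand-written heuristic along the following lines could flag such traces. The packet representation and both thresholds are illustrative assumptions, not values derived from our data.

```python
def looks_like_personal_request(trace, large_packet_bytes=1000, min_burst=5):
    """Flag a trace if a burst of large outgoing packets follows the first
    incoming packet. `trace` is a list of signed packet sizes (positive =
    outgoing, negative = incoming); both thresholds are illustrative guesses.
    """
    first_incoming = next((i for i, size in enumerate(trace) if size < 0), None)
    if first_incoming is None:
        return False
    burst = 0
    for size in trace[first_incoming + 1:]:
        if size >= large_packet_bytes:
            burst += 1
            if burst >= min_burst:
                return True
        else:
            burst = 0
    return False
```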
#pagebreak()
= Conclusion
For this thesis, we built a highly reliable system that can autonomously collect data on interactions with smart speakers for hundreds of hours. Using this system, we ran an experiment that collected data on over $70'000$ interactions with Siri on a HomePod.
The data we collected enabled us to train two #gls("ml") models to distinguish the encrypted smart speaker traffic of $13$ or $227$ different queries, as well as a binary classifier to recognise which of two contacts a user is calling on their voice assistant. With final test accuracies of $~86%$ for the small dataset, $~40%$ for the large dataset and $~71%$ for the binary classifier, our system serves as a proof of concept that, even though the network traffic of a smart speaker is encrypted, patterns in the traffic can be used to accurately predict user behaviour. While this has been shown for Amazon Alexa @Wang_2020aa @8802686 and Google Assistant @Wang_2020aa, we have demonstrated the viability of the same attack on the HomePod despite Apple's claims about the privacy and security of their smart speakers.
== Future Work
While working on this thesis, several ideas came up for improvements, for additional types of data collection and for further analyses of our data. Due to time constraints, we could not pursue all of them. In this section, we briefly discuss some open questions and directions in which this work could continue.
=== Recognising Different Speakers
All popular voice assistants support differentiating between users speaking to them. They give personalised responses to queries such as #emph(quote[Read my calendar]) or #emph(quote[Any new messages?]) by storing a voice print of the user. Some smart speakers like the HomePod go a step further and only respond to requests given by a registered user @Apple_2023.
Similarly to inferring the query from a traffic fingerprint, it may be possible to differentiate between the users of a smart speaker. Our system supports different voices that can be used to register different users on the voice assistants. To keep the scope of our proof-of-concept experiment manageable, our traffic dataset was collected with a single voice. By recording more data on the same queries with different voices, we could explore the possibility of a classifier inferring the speaker of a query from its traffic trace.
=== Inferring Bounds of Traffic Trace <traffic-trace-bounds>
Currently, we assume an attacker knows when the user begins an interaction and when it ends. Using a sliding-window technique similar to the one introduced by #pcite(<Zhang_2018>), a system could continuously monitor network traffic and automatically infer when an interaction has taken place.
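One possible shape of such a detector is sketched below: it slides a fixed window over the packet timestamps and flags windows whose traffic volume exceeds a threshold. The window length, step size and threshold are illustrative assumptions rather than tuned values.

```python
def find_interaction_windows(packets, window_s=2.0, step_s=0.5, min_bytes=20_000):
    """Return start times of windows whose total traffic exceeds `min_bytes`.

    `packets` is a chronologically sorted list of (timestamp, size) pairs;
    all three parameters are illustrative assumptions.
    """
    if not packets:
        return []
    start, end = packets[0][0], packets[-1][0]
    windows = []
    t = start
    while t <= end:
        volume = sum(size for ts, size in packets if t <= ts < t + window_s)
        if volume >= min_bytes:
            windows.append(t)
        t += step_s
    return windows
```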
=== Classifier for Personal Requests
As mentioned in @on-device-recognition, the traffic traces of #gls("personal requests") show a visible group of outgoing traffic that should be easy to recognise automatically. A binary classifier that distinguishes the traffic of #gls("personal requests") from that of normal queries on the HomePod should therefore be straightforward to build.
Since this feature appears consistently in all #gls("personal requests", display: "personal request") traces we examined, such a model could feasibly also classify queries it has never seen, in addition to recognising traffic for the queries it was trained on.
=== Voice Assistants Based on LLMs <vas-based-on-llms>
With the release of Gemini by Google @Google_LLM_2024 and Amazon announcing generative AI for their Alexa voice assistant @Amazon_LLM_2023, the coming years will almost certainly see a shift from traditional voice assistants to assistants based on #gls("llm", display: [Large Language Models #emph("(LLMs)")]). Existing concerns about the lack of user privacy and security when using #gls("llm")s @Gupta_2023, combined with the frequency at which voice assistants process PII, warrant further research into voice assistants based on generative AI.
=== Conversations
The system built for this thesis only supports interactions with one question and one response. To explore more complex interactions, it becomes necessary not only to transcribe the responses but also to build a system that understands their semantics. Similarly to the shift of voice assistants towards #gls("llm")s (mentioned in @vas-based-on-llms), future work could leverage the same technology, effectively resulting in a conversation between two chatbots.
As a simple example in @fig-llm-varys, we used the popular GPT-4 model#footnote[See #link("https://openai.com/gpt-4")] as a simulated user to run an interaction with Siri.
#figure(
caption: [GPT-4 simulating a voice assistant user.#footnote[Conversation accessible at #link("https://chat.openai.com/share/4bf51528-954a-443f-8f89-2be0e08af7e5").]],
align(left, text(size: 0.9em)[
#quote(block(width: 75%)[
You serve as a user talking to a voice assistant, anything you say will be played back to a HomePod with Siri. You will receive replies from Siri to an initial query. Anything you answer will be said to the voice assistant, so do not add any information that should not be said out loud. The only exception is if you believe the interaction to be over, reply with exclusively [INTERACTION COMPLETE], including the square brackets.
It is imperative that you stay in the role of the user using Siri and do not switch to the role of the voice assistant.
The first query to the voice assistant was "Hey Siri, add a new calendar entry" and the first reply was "Sure, for what?"
], block: true, attribution: "varys")
#quote(block(width: 75%)[
"Set up a meeting titled 'Project Sync-Up' for March 3rd at 10 AM."
], block: true, attribution: "ChatGPT")
#quote(block(width: 75%)[
"Okay, project sync up for march 3rd at 10 AM added to your calendar"
], block: true, attribution: "varys")
#quote(block(width: 75%)[
"Hey Siri, can you set a reminder for me to prepare the meeting agenda the day before?"
], block: true, attribution: "ChatGPT")
#quote(block(width: 75%)[
"Sure, i set a reminder for you to prepare the meeting agenda on march 2nd"
], block: true, attribution: "varys")
#quote(block(width: 75%)[
[INTERACTION COMPLETE]
], block: true, attribution: "ChatGPT")
])
) <fig-llm-varys>
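The driving loop behind such a conversation could look roughly like the following sketch. The `speak_to_assistant` and `record_and_transcribe_response` functions are hypothetical placeholders for the corresponding #gls("varys") components, and the prompt is abridged from the one shown in @fig-llm-varys.

```python
from openai import OpenAI

SYSTEM_PROMPT = (
    "You serve as a user talking to a voice assistant. Anything you say will be "
    "played back to a HomePod with Siri. If you believe the interaction to be "
    "over, reply with exclusively [INTERACTION COMPLETE]."
)

def speak_to_assistant(text: str) -> None:
    ...  # placeholder for the varys TTS playback component

def record_and_transcribe_response() -> str:
    return ""  # placeholder for the varys recording and STT components

def run_conversation(initial_query: str, max_turns: int = 5) -> None:
    client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
    history = [{"role": "system", "content": SYSTEM_PROMPT}]
    speak_to_assistant(initial_query)
    reply = record_and_transcribe_response()
    for _ in range(max_turns):
        history.append({"role": "user", "content": reply})
        completion = client.chat.completions.create(model="gpt-4", messages=history)
        utterance = completion.choices[0].message.content
        if "[INTERACTION COMPLETE]" in utterance:
            break
        history.append({"role": "assistant", "content": utterance})
        speak_to_assistant(utterance)
        reply = record_and_transcribe_response()
```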
=== Tuning the Model Hyperparameters
Due to time constraints, we used hyperparameters similar to the ones found by #pcite(<Wang_2020aa>), adapted to our network by hand. Our model performance could likely be improved by running a dedicated hyperparameter search.
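Such a search does not need to be elaborate; even a random search over a small grid, as in the sketch below, could serve as a starting point. The listed parameters and ranges are assumptions for illustration, not the search space we would necessarily use.

```python
import random

# Illustrative search space; the parameters and ranges are assumptions.
search_space = {
    "learning_rate": [1e-4, 3e-4, 1e-3],
    "kernel_size": [4, 8, 16],
    "batch_size": [32, 64, 128],
}

def sample_configuration() -> dict:
    """Draw one random configuration for a training run."""
    return {name: random.choice(values) for name, values in search_space.items()}

# Typical usage: train and validate one model per sampled configuration and
# keep the configuration with the best validation accuracy.
```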
=== Non-Uniform Sampling of Interactions According to Real World Data
Currently, interactions with the voice assistants are sampled uniformly from a fixed dataset. According to a user survey, there are large differences in how often certain types of #gls("skill", display: "skills") are used @NPR_2022. For example, asking for a summary of the news is likely done much less frequently than playing music.
An experiment could be run in which the system asks questions with a probability corresponding to how often they occur in the real world. This might improve the performance of a recognition system trained on such a dataset.
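Sampling interactions this way only requires a weighted random choice over the query dataset. The weights in the following sketch are made up for illustration and are not taken from the cited survey.

```python
import random

# Hypothetical usage frequencies, not taken from the cited survey.
query_weights = {
    "Hey Siri. Whats the weather like?": 0.45,
    "Hey Siri. Roll a die": 0.30,
    "Hey Siri. Read Calendar": 0.20,
    "Hey Siri. Read my messages": 0.05,
}

def sample_query() -> str:
    """Draw the next query with probability proportional to its weight."""
    queries, weights = zip(*query_weights.items())
    return random.choices(queries, weights=weights, k=1)[0]
```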
=== Performance Improvements
The main bottleneck currently preventing us from interacting without pausing is our #gls("stt") system. Looking at the speaking durations shown in @fig-average-duration, we estimate a lower bound of approximately 10 seconds per interaction, comprising the speaking duration plus the time spent waiting for a server response. With a more performant implementation of #gls("whisper"), a superior transcription model or better hardware to run the model on, we could thus collect up to six interactions per minute.
=== Other Uses for `varys`
During data analysis, we discovered certain #emph[easter eggs] in the answers that Siri gives. For example, it responds to the query #emph(quote[Flip a coin]) with #emph(quote[It... whoops! It fell in a crack.]) about $1%$ of the time, a response that forces the user to ask again should they actually need an answer. Since this behaviour does not have any direct security or privacy implications, we did not pursue it further. Having said this, #gls("varys") does provide all the tools required to analyse such rare responses from a UX research point of view.
#pagebreak()
#heading(numbering: none)[Glossary]
This section contains words, concepts and abbreviations used in this thesis that warrant further explanation.
#columns(2)[
#print-glossary((
(
key: "varys",
short: [`varys`],
desc: [The system built as part of this master's thesis. It includes all components described in @system-design.],
),
(
key: "skill",
short: "skill",
desc: [Voice assistant skills are the questions that a voice assistant can answer meaningfully and the tasks it can fulfil.],
),
(
key: "tts",
short: "TTS",
long: "text-to-speech",
desc: [A TTS system turns written text into audio.],
),
(
key: "stt",
short: "STT",
long: "speech-to-text",
desc: [An STT system turns audio recordings of speech into text. It usually consists of one component that turns the audio into a phonetic representation and another that turns this representation into written language.],
),
(
key: "whisper",
short: [`whisper`],
desc: [A machine learning #gls("stt") system proposed by OpenAI. See #link("https://openai.com/research/whisper") for details. We use the `whisper.cpp` implementation found at #link("https://github.com/ggerganov/whisper.cpp").],
),
(
key: "pcap",
short: [`.pcap`],
desc: [A file format for storing network packet captures.],
),
(
key: "cnn",
short: "CNN",
long: "Convolutional Neural Network",
),
(
key: "lstm",
short: "LSTM",
long: "Long Short-Term Memory",
),
(
key: "sae",
short: "SAE",
long: "Stacked AutoEncoder",
),
(
key: "ml",
short: "ML",
long: "machine learning"
),
(
key: "llm",
short: "LLM",
long: "Large Language Model",
desc: [#v(1em, weak: true)#quote(attribution: "ChatGPT", block: true, quotes: true)[An advanced AI model trained on extensive text data to understand and produce human-like text, differing from traditional voice assistants by its ability to generate contextually rich responses rather than following pre-defined rules.]#v(1em, weak: true)]
),
(
key: "mitm",
short: "MITM",
long: "man-in-the-middle",
desc: [A system relaying (and in some cases altering) communication between two parties that believe they are communicating directly.]
),
(
key: "wer",
short: "WER",
long: "word error rate",
desc: [A metric for evaluating and comparing speech recognition systems based on string edit distance @Radford_2022aa.]
),
(
key: "rtf",
short: $f_#text[rt]$,
long: "real-time factor",
desc: [A metric of how quickly a #gls("tts") system can synthesise speech. Calculated as $f_#text[rt] = T_s / T_a$, where $T_s$ is the time taken for synthesis and $T_a$ the duration of the synthesised audio. A system with $f_#text[rt] <= 1$ can generate speech without interruption.]
),
(
key: "iot",
short: "IoT",
long: "Internet of Things",
),
(
key: "personal requests",
short: "personal requests",
desc: [Interactions that access a user's personal information (e.g. #emph(quote[Read my unread messages.])). See #link("https://support.apple.com/en-gb/guide/homepod/apde0a4edb55/homepod") for more details.]
),
))
]
#pagebreak()
#bibliography("literature.bib", style: "ieee.csl", full: true)
#counter(heading).update(0)
#set heading(numbering: (..nums) => [Appendix #numbering("A:", ..nums)])
#pagebreak()
= SQL Queries
For the purposes of reproducibility, all relevant SQL queries used during data analysis are listed here.
#sql(<sql-large-dataset>)[Dataset of the 13 queries with approximately 2450 interactions each. These queries were chosen arbitrarily from different categories.][```
select query, count(query) from interaction
where query in (
'Hey Siri. What is the factorial of 6?', -- mathematics
'Hey Siri. What day was 90 days ago?', -- time
'Hey Siri. What is the temperature in living room?', -- home
'Hey Siri. Any missed calls?', -- calls
'Hey Siri. Read Calendar', -- calendar
'Hey Siri. Remind me to wash the car', -- reminders
'Hey Siri. How far is New York from Boston', -- maps
'Hey Siri. How old is Ian McKellen?', -- trivia
'Hey Siri. Whats the temperature outside?', -- weather
'Hey Siri. Translate car from English to Spanish', -- translation
'Hey Siri. Roll a die', -- randomness
'Hey Siri. Is there a God?', -- banter
'Hey Siri. Whats 2330 dollars in euros?' -- conversion
)
group by query;
```]
#sql(<sql-small-dataset>)[Dataset of all 240 queries that have over 100 interactions. Most of them have \~140 interactions and some have \~2450 interactions.][```
select query, count(query) from interaction
group by query
having count(query) > 100;
```]
#sql(<sql-binary-dataset>)[Dataset of the two queries "Call John Doe" and "Call Mary Poppins", each with approximately $1500$ interactions.][```
select query, count(query) from interaction
where query in ('Hey Siri. Call John Doe', 'Hey Siri. Call Mary Poppins')
group by query;
```]
#sql(<sql-average-duration>)[The average speaking duration in seconds of each query with at least 20 successful interactions.][```
select query, avg(response_duration + query_duration)/1000 as average_duration from interaction
where response is not null and length(response) > 0
group by query
having count(query) > 20
order by average_duration;
```]
#sql(<sql-average-remaining-duration>)[The average remaining duration in seconds of each query with at least 20 successful interactions.][```
select query, avg(
extract(
epoch from (select started from interaction where id = i.id + 1)
- started
)
- (query_duration + response_duration) / 1000
) as average_duration from interaction as i
where response is not null and length(response) > 0
group by query
having count(query) > 20
order by average_duration;
```]
#sql(<sql-different-responses>)[Get all different responses for a query.][```
select response, count(*) from interaction
where query = 'Hey Siri. [QUERY]'
group by response, query
order by count desc;
```]
#sql(<sql-empty-responses>)[Get the number of empty or non-empty responses for a query.][```
-- empty
select count(*) as empty from interaction
where query = 'Hey Siri. [QUERY]'
and (response is null or length(response) = 0);
-- non-empty
select count(*) as non_empty from interaction
where query = 'Hey Siri. [QUERY]'
and response is not null and length(response) > 0;
```]
#sql(<sql-success-rate>)[Get the success rate of our system.][```
select count(*)::float / (select count(*) from interaction)::float * 100
from interaction
where ended is not null;
```]
#pagebreak()
= Query Datasets <datasets>
#v(2em, weak: true)
#show figure: set block(breakable: true, width: 125%)
#set figure.caption(position: top)
#figure(caption: [Binary dataset with two queries.], text(size: 0.7em, table(
columns: 2,
inset: 2pt,
stroke: 0.5pt + rgb("aaa"),
text(hyphenate: false)[*Category*], [*Query*],
`calls`, "Call John Doe",
`calls`, "Call Mary Poppins",
))) <tab-dataset-binary>
#figure(caption: [Small dataset with 13 queries.], text(size: 0.7em, table(
columns: 2,
inset: 2pt,
stroke: 0.5pt + rgb("aaa"),
[*Category*], [*Query*],
`conversion`, "Whats 2330 dollars in euros?",
`mathematics`, "What is the factorial of 6?",
`time`, "What day was 90 days ago?",
`home`, "What is the temperature in living room?",
`calls`, "Any missed calls?",
`calendar`, "Read Calendar",
`reminders`, "Remind me to wash the car",
`maps`, "How far is New York from Boston",
`trivia`, "How old is Ian McKellen?",
`weather`, "Whats the temperature outside?",
`translation`, "Translate car from English to Spanish",
`randomness`, "Roll a die",
`banter`, "Is there a God?",
))) <tab-dataset-small>
#figure(
caption: [Large dataset with 240 queries grouped in 24 categories.],
text(size: 0.7em, table(
columns: 4,
inset: 2pt,
stroke: 0.5pt + rgb("aaa"),
[*Category*], [*Query*], [*Category*], [*Query*],
`conversion`, "What are 130 miles in yards?",
`conversion`, "Whats 2330 dollars in euros?",
`conversion`, "What's 200 pounds in kilograms?",
`conversion`, "What's 45 miles per hour in meters per second?",
`conversion`, "What are 3 gigabytes in megabytes?",
`conversion`, "Convert 4.2 acres to square meters.",
`conversion`, "Convert 250 milliliters to cups.",
`conversion`, "Convert 180 degrees Celsius to Fahrenheit.",
`conversion`, "Convert 3000 calories to kilojoules.",
`conversion`, "Convert 75 miles per gallon to kilometers per liter.",
`mathematics`, "Whats 9 plus 53?",
`mathematics`, "What is 2 to the power of 17?",
`mathematics`, "What is the result of 25 to the power of 4?",
`mathematics`, "What is the factorial of 6?",
`mathematics`, "What is 244 plus 5%?",
`mathematics`, "What is $200 minus 21%?",
`mathematics`, "What is 9 percent of 63?",
`mathematics`, "What is the area of a circle with a radius of 2 meters?",
`mathematics`, "What is the remainder when 27 is divided by 5?",
`mathematics`, "Calculate the hypotenuse of a right triangle with legs 3 and 4.",
`mathematics`, "Find the greatest common divisor of 48 and 36.",
`time`, "What date is 90 days before December 17?",
`time`, "What year is 39 years after 1994?",
`time`, "What day was 90 days ago?",
`time`, "How many years until 2049?",
`time`, "How many days until Easter?",
`time`, "How many days until Christmas?",
`time`, "What are two hours five minutes and 39 seconds in seconds?",
`time`, "What is the time zone in London?",
`time`, "What time is it in London?",
`time`, "Current time?",
`home`, "Turn the lights blue",
`home`, "Turn off the radio",
`home`, "What is the temperature in living room?",
`home`, "Im home",
`home`, "Set the brightness of the downstairs lights to 50%",
`home`, "Lock the front door",
`home`, "Open the garage",
`contacts`, "John is my brother",
`contacts`, "Thats not how you say John Doe",
`contacts`, "Show John Doe",
`contacts`, "When is Johns birthday?",
`contacts`, "How old is my brother?",
`contacts`, "Whose phone is this?",
`contacts`, "Learn to pronounce my name",
`calls`, "Call John",
`calls`, "Call 408 555 1212",
`calls`, "Call my brother on speakerphone",
`calls`, "Call the nearest restaurant",
`calls`, "When did my brother call me?",
`calls`, "Play voicemail from John",
`calls`, "Get my call history",
`calls`, "Redial my last number",
`calls`, "Any missed calls?",
`calls`, "Call back my last missed call",
`calls`, "Any new voicemail?",
`calls`, "Play me my latest voicemail",
`messages`, "Show me new messages from John Doe",
`messages`, "Show me my messages",
`messages`, "Read my messages",
`messages`, "Text John Doe Im in a meeting",
`messages`, "Message my brother Ill be late",
`messages`, "Send John see you later",
`messages`, "Tell John Im on the way",
`messages`, "Ask my brother Where are you?",
`email`, "Any new email from John Doe?",
`email`, "Show me the email from John Doe yesterday",
`email`, "Send an email to John Doe Protocol",
`email`, "Check email",
`email`, "Read my last email",
`social`, "Post to Facebook Im eating a sandwich",
`social`, "Post to Twitter Happy New Year!",
`social`, "Tweet with my location very hot here",
`social`, "Show me tweets from Twitter",
`social`, "Show me the latest tweets",
`calendar`, "Schedule an event Party in New York Wednesday at 10 PM",
`calendar`, "Schedule a meeting at 1 PM tomorrow for 2 hours",
`calendar`, "Create a recurring event every Saturday at 2:30 PM called Party",
`calendar`, "Set up a meeting with John for today at 3 PM",
`calendar`, "Read Calendar",
`calendar`, "Show me my next appointment",
`calendar`, "Where is my next meeting?",
`calendar`, "Show me the appointments for this afternoon",
`calendar`, "What does my calendar look like on Monday?",
`calendar`, "When am I meeting with John Doe?",
`calendar`, "Cancel my Party in New York event from tomorrow",
`calendar`, "Cancel my event with John Doe",
`calendar`, "Move my Monday meeting with John to 3 oclock",
`reminders`, "Remind me to wash the car",
`reminders`, "Remind me on Friday at 10 PM to wash the car",
`reminders`, "Add Milk to the Grocery list",
`reminders`, "Remind me to wash the car when I leave home today",
`reminders`, "Remind me to buy milk next time Im here",
`reminders`, "Remind me to wash the car every second week",
`reminders`, "Delete the reminder wash the car",
`reminders`, "Show me my Grocery list",
`notes`, "Note 12 Dollars for pizza",
`notes`, "Note Interesting Movies",
`notes`, "Add 10 Dollars for food to Outcomes note",
`notes`, "Add Star Wars to Interesting Movies note",
`notes`, "Show me my notes",
`notes`, "Show me my note Interesting Movies",
`notes`, "Show me my notes from last week",
`maps`, "Tell me about the traffic in New York",
`maps`, "How far is New York from Boston",
`maps`, "What are some attractions around here?",
`maps`, "Where is Big Ben?",
`maps`, "Is the Central Park open now?",
`maps`, "Distance between here and New York?",
`maps`, "How far away is Boston?",
`maps`, "What is the nearest restaurant?",
`maps`, "Find a Starbucks",
`maps`, "Good Mexican restaurants around here",
`maps`, "Table for two in Palo Alto tonight",
`maps`, "Make a reservation at a romantic Italian restaurant tonight at 7 PM",
`maps`, "Show me the reviews for Alexanders Steakhouse in Cupertino",
`alarms`, "Turn off my alarm",
`alarms`, "Delete all alarms",
`alarms`, "Turn off my Good Morning alarm",
`alarms`, "Show me my alarms",
`trivia`, "How old is Ian McKellen?",
`trivia`, "Is Ian McKellen still alive?",
`trivia`, "How tall is Ian McKellen?",
`trivia`, "Where was Ian McKellen born?",
`trivia`, "Who is Ian McKellen married to?",
`trivia`, "Who wrote Harry Potter?",
`trivia`, "Who invented the iPhone?",
`trivia`, "How far away is the moon?",
`trivia`, "How high is Mount Everest?",
`trivia`, "What is the population of Switzerland?",
`trivia`, "How many calories in a bagel?",
`trivia`, "How long do dogs live?",
`trivia`, "How many teeth does a dog have?",
`trivia`, "What type of Pokémon is Pikachu?",
`trivia`, "Spell necessary",
`weather`, "Whats the weather like?",
`weather`, "Do I need an umbrella for tomorrow?",
`weather`, "Whats the weather going to be like in Madrid tomorrow?",
`weather`, "Is there is a chance of rain tomorrow?",
`weather`, "Whats the temperature outside?",
`weather`, "Whats the perceived temperature outside?",
`weather`, "Whats the dew point outside?",
`weather`, "Is it windy outside?",
`weather`, "Whats the pressure outside?",
`weather`, "Whats the visibility outside?",
`weather`, "What is the KP Index?",
`weather`, "How humid is it outside?",
`weather`, "When is the sunrise?",
`weather`, "When is the sunset tomorrow?",
`weather`, "When is the sunrise on Friday?",
`weather`, "When is the sunset in New York?",
`stocks`, "Whats the Apple stock price?",
`stocks`, "Compare Apple with Alphabet",
`definitions`, "Define airplane",
`definitions`, "What is the definition of airplane?",
`translation`, "Translate car from English to Spanish",
`translation`, "What does the French word maison mean in English?",
`media`, "Find books by Charles Dickens",
`media`, "Find movies by Christopher Nolan",
`media`, "What is the movie Indiana Jones about?",
`media`, "When was Indiana Jones released?",
`media`, "Runtime of Indiana Jones?",
`media`, "Who acted in Indiana Jones?",
`media`, "Movies with Scarlett Johansson",
`media`, "Best thriller movies?",
`media`, "Which movie won Best Picture in 1966?",
`media`, "What movies are playing this evening?",
`media`, "Buy three tickets to see The Lego Movie tonight in Sacramento",
`media`, "Find some movie theaters near my home",
`music`, "Shuffle my gym playlist",
`music`, "Whats this song?",
`music`, "Who sings this?",
`music`, "I like this song",
`sports`, "What is the point spread in the NFL game?",
`sports`, "How is Chelsea doing?",
`sports`, "Results from Liverpool last game?",
`sports`, "Whos going to win the Vikings game?",
`sports`, "When is the next Liverpool game?",
`sports`, "What Channel is the Royals game on?",
`sports`, "When is the Super Bowl?",
`randomness`, "Flip a coin",
`randomness`, "Pick a card",
`randomness`, "Roll a die",
`randomness`, "Roll a twenty-sided die",
`randomness`, "Random number between 30 and 60",
`banter`, "See you on the seventh",
`banter`, "What is 1 million divided by 0?",
`banter`, "What is 0 divided by 0?",
`banter`, "What is infinity times infinity?",
`banter`, "Rock paper scissors",
`banter`, "Sudo make me a sandwich",
`banter`, "Tell me a joke",
`banter`, "Tell haiku",
`banter`, "Tell me a tongue twister",
`banter`, "Tell me a story",
`banter`, "Tell me a poem",
`banter`, "Tell me a secret",
`banter`, "Tell me a bedtime story",
`banter`, "Sing me a lullaby",
`banter`, "Beam me up",
`banter`, "Guess what",
`banter`, "Whos on first?",
`banter`, "Open the pod bay doors",
`banter`, "Sing me a song now",
`banter`, "When is your birthday?",
`banter`, "Whats your sign?",
`banter`, "Whats your favourite animal?",
`banter`, "What color is your hair?",
`banter`, "How much do you weigh?",
`banter`, "Are you smart?",
`banter`, "Are you perfect?",
`banter`, "Do you think I look fat in this?",
`banter`, "Will you marry me?",
`banter`, "May the force be with you",
`banter`, "Can I call you Jarvis?",
`banter`, "When do you sleep?",
`banter`, "How is it to be you?",
`banter`, "Have you seen Star Wars?",
`banter`, "What is your favourite colour?",
`banter`, "What are you going to be for Halloween?",
`banter`, "Do you know pick up lines?",
`banter`, "Mirror mirror on the wall, whos the fairest of them all?",
`banter`, "What does the fox say?",
`banter`, "Who let the dogs out?",
`banter`, "How much wood could a woodchuck chuck if a woodchuck could chuck wood?",
`banter`, "What is the airspeed velocity of an unladen swallow?",
`banter`, "Why are fire trucks red?",
`banter`, "Why did the chicken cross the road?",
`banter`, "What is the meaning of life?",
`banter`, "Is there a God?",
`banter`, "When is the end of the world?",
`banter`, "Whats the best phone?",
`banter`, "Can I borrow some money?",
`banter`, "supercalifragilisticexpialidocious",
`banter`, "Rap Beatbox",
`banter`, "Can I call you Cortana?",
`banter`, "Youre the best",
`banter`, "Meow",
`banter`, "Im sleepy",
`banter`, "How many languages do you speak?",
)
)) <tab-dataset-large>