// varys-presentation/main.typ

#import "@preview/polylux:0.3.1": *
#import "university-custom.typ": *
#import "@preview/fletcher:0.4.1" as fletcher: node, edge
#import "@preview/cetz:0.2.0"
#let mint = rgb("a5d7d2")
#let mint-hell = rgb("d2ebe9")
#let rot = rgb("d20537")
#let anthrazit = rgb("2d373c")
#let anthrazit-hell = rgb("46505a")
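// show inline quotes in italics, leave block quotes unchanged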
#show quote: it => {
if it.block {
it
} else {
emph(it)
}
}
// diagrams
#let edge_(..args) = edge(..args, marks: (none, "|>")) // we can't use set rules for user defined functions yet, so we overwrite it
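// coloured node and dotted edge helpers used to highlight groups in the diagrams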
#let group_node(colour, ..args) = node(
..args,
stroke: colour,
fill: colour.lighten(80%),
)
#let group_edge(colour, description, ..args) = edge(
..args,
text(colour)[#description],
"..",
stroke: colour,
label-side: left,
)
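// draws a rounded, tinted rectangle around the bounding box of the given fletcher nodes;
// the `clearance` tuple appears to use (top, right, bottom, left) padding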
#let enclose_nodes(nodes, colour, clearance: (8pt, 8pt, 8pt, 8pt)) = {
let (center, size) = fletcher.bounding-rect(nodes.map(node => node.real-pos))
center.at(0) = center.at(0) - (clearance.at(3) - clearance.at(1))
center.at(1) = center.at(1) - (clearance.at(2) - clearance.at(0))
cetz.draw.content(
center,
rect(
width: size.at(0) + clearance.at(1) + clearance.at(3),
height: size.at(1) + clearance.at(2) + clearance.at(0),
radius: 16pt,
stroke: colour,
fill: colour.lighten(85%),
)
)
}
// functions
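// white focus slide showing only the centred, bold title; used as a section divider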
#let focus-title-slide(title) = {
focus-slide(background-color: white)[
#align(center, text(fill: anthrazit, size: 0.7em, weight: "bold", title))
]
}
// #pdfpc.config(
// duration-minutes: 30,
// // start-time: "10:15",
// // end-time: "10:45",
// // note-font-size: 24,
// )
#show: university-theme.with(
short-title: [A Testbed for Voice Assistant Traffic Fingerprinting],
short-author: [Milan van Zanten],
color-a: anthrazit,
color-b: mint,
)
#title-slide(
title: [A Testbed for Voice Assistant \ Traffic Fingerprinting],
subtitle: [Master Thesis Presentation],
authors: [Milan van Zanten],
date: [21.03.2024],
institution-name: [University of Basel],
logo: pad(1em, image("unibas-logo.svg"))
)
#slide(title: [Outline])[
+ Voice Assistants
+ Traffic Fingerprinting
+ Testbed
+ Results
+ Demo
Ask questions any time!
]
#focus-title-slide[
A Testbed for #text(fill: rot)[Voice Assistant] \ Traffic Fingerprinting
]
#slide(title: [Devices], new-section: [Voice Assistants])[
#pdfpc.speaker-note("Three main VAs")
Specifically, #emph[Smart Speakers]
#only(1)[
#side-by-side[
#align(center)[
#image(width: 50%, "images/alexa.jpg")
Echo Dot
#emph[Amazon Alexa]
]
][
#align(center)[
#image(width: 50%, "images/siri.jpg")
HomePod Mini
#emph[Siri]
]
][
#align(center)[
#image(width: 50%, "images/google-assistant.jpg")
Google Home Mini
#emph[Google Assistant]
]
]
]
#only(2)[
#side-by-side[
#align(center)[
#image(width: 40%, "images/alexa.jpg")
#text(size: .6em)[
Echo Dot
#emph[Amazon Alexa]
]
]
][
#align(center)[
#image(width: 80%, "images/siri.jpg")
HomePod Mini
#emph[Siri]
]
][
#align(center)[
#image(width: 40%, "images/google-assistant.jpg")
#text(size: .6em)[
Google Home Mini
#emph[Google Assistant]
]
]
]
]
]
#slide(title: [Smart Speaker Privacy / Security])[
#counter(footnote).update(0) // reset footnote counter
#pdfpc.speaker-note("Why do we want to look at smart speakers?")
#pdfpc.speaker-note("Alexa; guest voice wrongly recognised 65%")
#alternatives(repeat-last: true)[
none #h(1em) `(╯°□°)╯︵ ┻━┻`
][
There are concerns...
]
#pause
- Usually located where sensitive conversations take place
- Necessarily always listening
- Misactivations
- Used to control smart home devices (e.g. door locks)
- No authentication\*
#uncover(3)[About 40% of households in the U.S. own a smart speaker.]
#v(1em)
#text(0.6em)[\* Voice recognition is still insecure.]
]
#slide(title: [Attacks on Smart Speakers])[
#pdfpc.speaker-note("skill squatting; specific to alexa, skills basically apps")
#pdfpc.speaker-note("Boil an egg is an existing skill")
Active:
- Malicious activations
- Similar pronunciations, "skill squatting"
- (e.g. "Boil an egg" $->$ "Boyle an egg")#footnote[D. Kumar et al., #quote[Skill Squatting Attacks on Amazon Alexa], August 2018, Available: #link("https://www.usenix.org/conference/usenixsecurity18/presentation/kumar")]
Passive:
- #alternatives[Traffic Fingerprinting][#text(fill: rot)[Traffic Fingerprinting]]
]
#focus-title-slide[
A Testbed for Voice Assistant \ #text(fill: rot)[Traffic Fingerprinting]
]
#slide(title: [], new-section: [Traffic Fingerprinting])[
#pdfpc.speaker-note("Just after the release of SSL 3.0")
#pdfpc.speaker-note("David Wagner and Bruce Schneier")
#pdfpc.speaker-note("Probably one of the first mentions")
#v(2em)
#quote(attribution: [Wagner and Schneier#footnote[D. Wagner and B. Schneier, #quote[Analysis of the SSL 3.0 Protocol], November 1996, Available: #link("https://dl.acm.org/doi/10.5555/1267167.1267171")]], block: true, quotes: true)[[SSL] traffic analysis aims to recover confidential information about protected sessions by examining unencrypted packet fields and #alternatives(repeat-last: true)[unprotected packet attributes][#text(fill: rot)[unprotected packet attributes]]. For example [...] the volume of network traffic flow]
#uncover(3)[... packet direction, timing, and more]
]
#slide(title: [Timeline])[
#counter(footnote).update(0) // reset footnote counter
#pdfpc.speaker-note("Relevant developments in my opinion")
#pdfpc.speaker-note("Abe and Goto; Denoising Autoencoder on Tor traffic")
#pdfpc.speaker-note("Mao et al.; time between packets")
#pdfpc.speaker-note("Ahmed, Sabir and Das; so far all attacks assumed known traffic window, allowed them to do end-to-end")
/ 1996: Wagner and Schneier#footnote[Timeline references can be found at the end of the presentation.], #text(fill: rot)[coined SSL traffic analysis]
/ 1998: Cheng and Avnur, #text(fill: rot)[website traffic analysis]
#h(4em) #text(fill: rot)[website fingerprinting (WF)...]
/ 2016: Abe and Goto, #text(fill: rot)[deep learning WF]
/ 2019: Kennedy et al., #text(fill: rot)[apply WF techniques to voice assistants (VA)]
/ 2020: Wang et al., #text(fill: rot)[deep learning VA fingerprinting]
/ 2022: Mao et al., #text(fill: rot)[temporal features]
/ 2023: Ahmed, Sabir and Das, #text(fill: rot)[invocation detection]
]
#slide(title: [Threat Model])[
#pdfpc.speaker-note("2. to filter traffic")
#align(center, text(size: 0.7em,
fletcher.diagram(
node-stroke: 1pt,
edge-stroke: 1pt,
node-corner-radius: 8pt,
edge-corner-radius: 8pt,
node-fill: rgb("eee"),
spacing: 3em,
node((-2, 0), `VA Server`),
edge(`WAN`),
node((0, 0), `Gateway`),
edge("rr", `LAN`),
edge("r", "d", "r"),
node((2, 0), `Smart Speaker`),
node((2, 1), `Other Devices`),
group_node(rot, (1, -1), `Attacker`),
edge((1, -0.4), text(fill: rot)[`Intercept`], stroke: rot),
)
))
+ The attacker can intercept traffic from the smart speaker
+ The attacker knows the smart speaker address
+ The attacker knows the type of smart speaker used
+ The attacker knows the beginning and end of an interaction
]
#slide(title: [Closed-World])[
#v(2em)
- Fixed list of monitored voice commands
- Traffic is considered to come from one of the monitored commands
- Multiclass classification
Predicts which command was used.
]
#slide(title: [Open-World])[
#v(2em)
- Traffic can also come from new, unmonitored commands
- Binary classification
Predicts whether traffic comes from a monitored or an unmonitored command.
]
#slide(title: [Combining Both Models])[
#v(2em)
#align(center, text(size: 0.7em,
fletcher.diagram(
node-stroke: 1pt,
edge-stroke: 1pt,
node-corner-radius: 8pt,
edge-corner-radius: 8pt,
node-fill: rgb("eee"),
spacing: 3em,
node((0, 0), `traffic`),
edge_(),
node((1, 0), `open-world classification`),
edge_("d", `unmonitored`),
edge_(`monitored`),
node((3, 0), `closed-world classification`),
edge_(),
node((3, 1), `prediction`),
node((1, 1), `unknown`),
)
))
]
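// A minimal sketch (not rendered) of the two-stage decision in the diagram
// above; `is-monitored` and `predict-command` are hypothetical stand-ins for
// the open-world and closed-world classifiers.
#let classify(trace, is-monitored, predict-command) = {
  // open-world gate first; only monitored traffic reaches the closed-world model
  if is-monitored(trace) { predict-command(trace) } else { "unknown" }
}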
#focus-title-slide[
A #text(fill: rot)[Testbed] for Voice Assistant \ Traffic Fingerprinting
]
#slide(title: [Comparison], new-section: [Testbed])[
#pdfpc.speaker-note("Why? Let's look at Website Traffic Fingerprinting")
#side-by-side[
Website Fingerprinting:
- Requires a large amount of data
- Data collection usually via a program making requests
- Only dependent on the network environment
- Fast
][
Voice Command Fingerprinting: #pause
- Requires a large amount of data #pause
- Interaction by speaking \ ~ #pause
- Hampered by environmental noise \ ~ #pause
- Slow and inefficient #pause
#text(fill: rot)[$->$ Sophisticated testbed]
]
]
#slide(title: [])[
#counter(footnote).update(0) // reset footnote counter
#pdfpc.speaker-note("Found in one of the newer papers")
#v(4em)
#quote(attribution: [Mao et al.#footnote[Jianghan Mao et al., #quote[A novel model for voice command fingerprinting using deep learning], March 2022, Available: #link("https://doi.org/10.1016/j.jisa.2021.103085")]], block: true, quotes: true)[The content of voice commands may vary from date to date; therefore, more efficient data collection tools need to be developed.]
]
#slide(title: [Requirements])[
- Sound isolation
- Isolated box
- Separate speaker/microphone
- Efficiency
- Every second saved per interaction means hours saved when collecting tens of thousands of interactions
- Dynamic interaction length by listening for silence
- Robustness
- Autonomously reset the VA if an error occurs
- Monitoring system
]
#slide(title: [System])[
#text(size: 0.8em)[
/ `varys`: The main executable combining all modules into the final system.
/ `varys-analysis`: Analysis of the data collected by `varys`.
/ `varys-audio`: Recording audio and the TTS and STT systems.
/ `varys-database`: Abstraction of the database system where interactions are stored.
/ `varys-network`: Collection of network traffic, writing and parsing of `.pcap` files.
]
#v(1em)
#align(center, text(size: 0.7em)[
#fletcher.diagram(
node-stroke: 1pt,
edge-stroke: 1pt,
node-corner-radius: 4pt,
edge-corner-radius: 4pt,
spacing: 1.5em,
node((0, 0), `varys`, fill: rgb("eee")),
edge_("r"),
edge_("dd", "l", "d"),
edge_("ddd"),
edge_("dd", "r", "d"),
group_node(anthrazit, (1, 0), "varys-analysis"),
edge_("d", (0.12, 1), (0.12, 2.625)),
edge_("d", (1.2, 1), (1.2, 2.625)),
group_node(anthrazit, (-1, 3), "varys-audio"),
group_node(anthrazit, (0, 3), "varys-network"),
group_node(anthrazit, (1, 3), "varys-database"),
)
])
]
#focus-title-slide[
Results
]
#slide(title: [Datasets], new-section: [Results])[
#v(2em)
\~800h, \~70'000 interactions
~
/ ` large`: 227 queries, 140 interactions each
/ ` small`: 13 queries, 2400 interactions each
/ `binary`: #quote[Call John Doe] and #quote[Call Mary Poppins], 1500 interactions each
]
#slide(title: [Efficiency])[
#pdfpc.speaker-note("Some outliers like \"tell me a story\" at >1min")
#align(center, text(size: 0.8em)[
#cetz.canvas({
import cetz.draw: *
import cetz.plot
let data = csv("csv/aggregate_average_duration.csv").map(item => {
(float(item.at(0)), int(item.at(1)))
})
let data_remaining = csv("csv/aggregate_average_remaining_duration.csv").map(item => {
(float(item.at(0)), int(item.at(1)))
})
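// each CSV row is (average duration in s, number of queries) and is drawn as a
// vertical line; the off-screen segments at x = 100 only exist to create the
// legend entries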
set-style(legend: (padding: 5pt, item: (spacing: 10pt)))
plot.plot(
size: (24, 8),
axis-style: "scientific-auto",
legend: "legend.inner-north",
x-label: "average duration [s]",
x-tick-step: 2,
x-min: 0,
x-max: 36,
y-label: "queries",
y-tick-step: 1,
y-min: 0,
{
for item in data {
plot.add(
((item.at(0), 0), (..item)),
style: (stroke: 2pt + mint),
)
}
plot.add(
((100, 0), (100, 1)),
style: (stroke: 8pt + mint),
label: "Avg. Speaking Duration",
)
for item in data_remaining {
plot.add(
((item.at(0), 0), (..item)),
style: (stroke: 2pt + rot),
)
}
plot.add(
((100, 0), (100, 1)),
style: (stroke: 8pt + rot),
label: "Avg. Remaining Duration",
)
})
})
])
]
#slide(title: [Traffic Trace Examples])[
#side-by-side[
#image(width: 100%, height: 89%, "images/plots/plot-Hey Siri. Any missed calls.png")
#v(10pt, weak: true)
#quote[Any missed calls?]
][
#image(width: 100%, height: 89%, "images/plots/plot-Hey Siri. What day was 90 days ago.png")
#v(10pt, weak: true)
#quote[What day was 90 days ago?]
]
]
#slide(title: [Fingerprinting Model])[
#counter(footnote).update(0) // reset footnote counter
#text(size: 0.96em)[
Feature extraction #text(size: 0.8em)[(packet size $s in [0, 1500]$ and direction $d in {0, 1}$)]:
$
(s, d) -> (-1)^(d) dot s/1500 #h(0.8em)
$
CNN adapted from Wang et al.#footnote[Chenggang Wang et al., #quote[Fingerprinting Encrypted Voice Traffic on Smart Speakers with Deep Learning], May 2020, Available: #link("https://doi.org/10.1145/3395351.3399357")]:
]
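// A minimal sketch (not rendered) of the feature mapping above; `packet-feature`
// is a hypothetical helper, with `size` in bytes and `direction` either 0 or 1.
#let packet-feature(size, direction) = {
  // (-1)^d * s / 1500 maps each packet to a signed value in [-1, 1]
  calc.pow(-1, direction) * size / 1500
}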
#text(size: 0.65em)[
#fletcher.diagram(
node-stroke: 1pt,
node-fill: rgb("eee"),
edge-stroke: 1pt,
node-corner-radius: 4pt,
edge-corner-radius: 4pt,
spacing: 1.5em,
group_node(rot, (0, 0), align(center)[Input]),
edge_(),
node((1, 0), align(center)[Conv. Layer \ `[tanh]`]),
edge_(),
node((2, 0), align(center)[Dropout \ $0.1$]),
edge_(),
node((3, 0), align(center)[Global Average \ Pooling]),
edge_(),
node((4, 0), align(center)[Dense Layer \ `[elu]`]),
edge_(),
node((5, 0), align(center)[Dense Layer \ `[softmax]`]),
edge_(),
group_node(rot, (6, 0), align(center)[Output]),
render: (grid, nodes, edges, options) => {
let cnn_1 = (nodes.at(1), nodes.at(2), nodes.at(3))
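// draw a grey rounded box behind the conv/dropout/pooling group before the diagram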
cetz.canvas({
enclose_nodes(cnn_1, rgb(0, 0, 0, 50%), clearance: (34pt, 64pt, 34pt, 61pt))
fletcher.draw-diagram(grid, nodes, edges, options)
})
}
)
]
]
#slide(title: [Classification])[
#v(2em)
Accuracy on test sets:
/ ` large`: \~40.40% (random choice \~0.44%)
/ ` small`: \~86.19% (random choice \~7.69%)
/ `binary`: \~71.19% (random choice 50%)
]
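// sanity check on the baselines above: 1/227 ≈ 0.44%, 1/13 ≈ 7.69%, 1/2 = 50%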
#slide(title: [Demo], new-section: [])[
#pdfpc.speaker-note("Since I've trained the model on my laptop, I used a smaller network.")
#text(size: 0.66em)[
```sh
./target/release/varys -i ap1 analyse demo data/ml/test_5_13\ queries_0.86 f4:34:f0:89:2d:75
```
]
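// reading of the arguments above (an assumption, not documented here): `-i ap1`
// selects the capture interface, followed by the path to the trained model and
// the MAC address of the monitored smart speaker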
#quote[Hey Siri, any missed calls?]
#quote[Hey Siri, remind me to wash the car.]
#v(1em)
#text(size: 0.66em)[It is unlikely this will work...]
]
#slide(header: pad(left: 1em, top: 0.5em, heading[Timeline References]))[
<references>
#text(size: 0.7em)[
- D. Wagner and B. Schneier, #quote[Analysis of the SSL 3.0 Protocol], November 1996, Available: #link("https://dl.acm.org/doi/10.5555/1267167.1267171")
- H. Cheng and R. Avnur, #quote[Traffic Analysis of SSL Encrypted Web Browsing], 1998
- K. Abe and S. Goto, #quote[Fingerprinting Attack on Tor Anonymity using Deep Learning], August 2016, Available: #link("https://core.ac.uk/display/229876143")
- S. Kennedy et al., #quote[I Can Hear Your Alexa: Voice Command Fingerprinting on Smart Home Speakers], June 2019, Available: #link("https://doi.org/10.1109/CNS.2019.8802686")
- C. Wang et al., #quote[Fingerprinting Encrypted Voice Traffic on Smart Speakers with Deep Learning], May 2020, Available: #link("https://doi.org/10.1145/3395351.3399357")
- J. Mao et al., #quote[A novel model for voice command fingerprinting using deep learning], March 2022, Available: #link("https://doi.org/10.1016/j.jisa.2021.103085")
- D. Ahmed, A. Sabir, and A. Das, #quote[Spying through Your Voice Assistants: Realistic Voice Command Fingerprinting], August 2023, Available: #link("https://www.usenix.org/conference/usenixsecurity23/presentation/ahmed-dilawer")
]
]