#import "@preview/polylux:0.3.1": * #import "university-custom.typ": * #import "@preview/fletcher:0.4.1" as fletcher: node, edge #import "@preview/cetz:0.2.0" #let mint = rgb("a5d7d2") #let mint-hell = rgb("d2ebe9") #let rot = rgb("d20537") #let anthrazit = rgb("2d373c") #let anthrazit-hell = rgb("46505a") #show quote: it => { if it.block { it } else { emph(it) } } // diagrams #let edge_(..args) = edge(..args, marks: (none, "|>")) // we can't use set rules for user defined functions yet, so we overwrite it #let group_node(colour, ..args) = node( ..args, stroke: colour, fill: colour.lighten(80%), ) #let group_edge(colour, description, ..args) = edge( ..args, text(colour)[#description], "..", stroke: colour, label-side: left, ) #let enclose_nodes(nodes, colour, clearance: (8pt, 8pt, 8pt, 8pt)) = { let (center, size) = fletcher.bounding-rect(nodes.map(node => node.real-pos)) center.at(0) = center.at(0) - (clearance.at(3) - clearance.at(1)) center.at(1) = center.at(1) - (clearance.at(2) - clearance.at(0)) cetz.draw.content( center, rect( width: size.at(0) + clearance.at(1) + clearance.at(3), height: size.at(1) + clearance.at(2) + clearance.at(0), radius: 16pt, stroke: colour, fill: colour.lighten(85%), ) ) } // functions #let focus-title-slide(title) = { focus-slide(background-color: white)[ #align(center, text(fill: anthrazit, size: 0.7em, weight: "bold", title)) ] } // #pdfpc.config( // duration-minutes: 30, // // start-time: "10:15", // // end-time: "10:45", // // note-font-size: 24, // ) #show: university-theme.with( short-title: [A Testbed for Voice Assistant Traffic Fingerprinting], short-author: [Milan van Zanten], color-a: anthrazit, color-b: mint, ) #title-slide( title: [A Testbed for Voice Assistant \ Traffic Fingerprinting], subtitle: [Master Thesis Presentation], authors: [Milan van Zanten], date: [21.03.2024], institution-name: [University of Basel], logo: pad(1em, image("unibas-logo.svg")) ) #slide(title: [Outline])[ + Voice Assistants + Traffic Fingerprinting + Testbed + Results + Demo Ask questions any time! ] #focus-title-slide[ A Testbed for #text(fill: rot)[Voice Assistant] \ Traffic Fingerprinting ] #slide(title: [Devices], new-section: [Voice Assistants])[ #pdfpc.speaker-note("Three main VAs") Specifically, #emph[Smart Speakers] #only(1)[ #side-by-side[ #align(center)[ #image(width: 50%, "images/alexa.jpg") Echo Dot #emph[Amazon Alexa] ] ][ #align(center)[ #image(width: 50%, "images/siri.jpg") HomePod Mini #emph[Siri] ] ][ #align(center)[ #image(width: 50%, "images/google-assistant.jpg") Google Home Mini #emph[Google Assistant] ] ] ] #only(2)[ #side-by-side[ #align(center)[ #image(width: 40%, "images/alexa.jpg") #text(size: .6em)[ Echo Dot #emph[Amazon Alexa] ] ] ][ #align(center)[ #image(width: 80%, "images/siri.jpg") HomePod Mini #emph[Siri] ] ][ #align(center)[ #image(width: 40%, "images/google-assistant.jpg") #text(size: .6em)[ Google Home Mini #emph[Google Assistant] ] ] ] ] ] #slide(title: [Smart Speaker Privacy / Security])[ #counter(footnote).update(0) // reset footnote counter #pdfpc.speaker-note("Why do we want to look at smart speakers?") #pdfpc.speaker-note("Alexa; guest voice wrongly recognised 65%") #alternatives(repeat-last: true)[ none #h(1em) `(╯°□°)╯︵ ┻━┻` ][ There are concerns... ] #pause - Usually located where sensitive conversations take place - Necessarily always listening - Misactivations - Used to control smart home devices (e.g. door locks) - No authentication\* #uncover(3)[About 40% of households in the U.S. own a smart speaker.] 
  #v(1em)
  #text(0.6em)[\* Voice recognition is still insecure.]
]

#slide(title: [Attacks on Smart Speakers])[
  #pdfpc.speaker-note("skill squatting; specific to alexa, skills basically apps")
  #pdfpc.speaker-note("Boil an egg is an existing skill")

  Active:
  - Malicious activations
  - Similar pronunciations, "skill squatting"
    - (e.g. "Boil an egg" $->$ "Boyle an egg")#footnote[D. Kumar et al., #quote[Skill Squatting Attacks on Amazon Alexa], August 2018, Available: #link("https://www.usenix.org/conference/usenixsecurity18/presentation/kumar")]

  Passive:
  - #alternatives[Traffic Fingerprinting][#text(fill: rot)[Traffic Fingerprinting]]
]

#focus-title-slide[
  A Testbed for Voice Assistant \
  #text(fill: rot)[Traffic Fingerprinting]
]

#slide(title: [], new-section: [Traffic Fingerprinting])[
  #pdfpc.speaker-note("Just after the release of SSL 3.0")
  #pdfpc.speaker-note("David Wagner and Bruce Schneier")
  #pdfpc.speaker-note("Probably one of the first mentions")

  #v(2em)
  #quote(attribution: [Wagner and Schneier#footnote[D. Wagner and B. Schneier, #quote[Analysis of the SSL 3.0 Protocol], November 1996, Available: #link("https://dl.acm.org/doi/10.5555/1267167.1267171")]], block: true, quotes: true)[[SSL] traffic analysis aims to recover confidential information about protected sessions by examining unencrypted packet fields and #alternatives(repeat-last: true)[unprotected packet attributes][#text(fill: rot)[unprotected packet attributes]]. For example [...] the volume of network traffic flow]

  #uncover(3)[... packet direction, timing, and more]
]

#slide(title: [Timeline])[
  #counter(footnote).update(0) // reset footnote counter
  #pdfpc.speaker-note("Relevant developments in my opinion")
  #pdfpc.speaker-note("Abe and Goto; Denoising Autoencoder on Tor traffic")
  #pdfpc.speaker-note("Mao et al.; time between packets")
  #pdfpc.speaker-note("Ahmed, Sabir and Das; so far all attacks assumed known traffic window, allowed them to do end-to-end")

  / 1996: Wagner and Schneier#footnote[Timeline references can be found at the end of the presentation.], #text(fill: rot)[coined SSL traffic analysis]
  / 1998: Cheng and Avnur, #text(fill: rot)[website traffic analysis]
    #h(4em) #text(fill: rot)[website fingerprinting (WF)...]
  / 2016: Abe and Goto, #text(fill: rot)[deep learning WF]
  / 2019: Kennedy et al., #text(fill: rot)[apply WF techniques to voice assistants (VA)]
  / 2020: Wang et al., #text(fill: rot)[deep learning VA fingerprinting]
  / 2022: Mao et al., #text(fill: rot)[temporal features]
  / 2023: Ahmed, Sabir and Das, #text(fill: rot)[invocation detection]
]

#slide(title: [Threat Model])[
  #pdfpc.speaker-note("2. to filter traffic")

  #align(center, text(size: 0.7em, fletcher.diagram(
    node-stroke: 1pt,
    edge-stroke: 1pt,
    node-corner-radius: 8pt,
    edge-corner-radius: 8pt,
    node-fill: rgb("eee"),
    spacing: 3em,
    node((-2, 0), `VA Server`),
    edge(`WAN`),
    node((0, 0), `Gateway`),
    edge("rr", `LAN`),
    edge("r", "d", "r"),
    node((2, 0), `Smart Speaker`),
    node((2, 1), `Other Devices`),
    group_node(rot, (1, -1), `Attacker`),
    edge((1, -0.4), text(fill: rot)[`Intercept`], stroke: rot),
  )))

  + The attacker can intercept traffic from the smart speaker
  + The attacker knows the smart speaker's address
  + The attacker knows the type of smart speaker used
  + The attacker knows the beginning and end of an interaction
]

#slide(title: [Closed-World])[
  #v(2em)

  - Fixed list of monitored voice commands
  - Traffic is assumed to come from one of the monitored commands
  - Multiclass classification

  Predicts which command was used.
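
  #text(size: 0.6em)[
    For illustration, a minimal sketch of this step, assuming per-command probabilities from some classifier; this is not the `varys` code, and the command list and numbers are made up:

    ```python
    MONITORED = ["any missed calls", "what time is it", "turn on the lights"]  # made-up list

    def closed_world_predict(probs):
        # closed-world: pick the monitored command with the highest classifier probability
        return max(zip(MONITORED, probs), key=lambda pair: pair[1])[0]

    print(closed_world_predict([0.1, 0.7, 0.2]))  # -> "what time is it"
    ```
  ]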
]

#slide(title: [Open-World])[
  #v(2em)

  - Traffic can also come from new, unmonitored commands
  - Binary classification

  Predicts whether traffic is from a monitored or an unmonitored command.
]

#slide(title: [Combining Both Models])[
  #v(2em)

  #align(center, text(size: 0.7em, fletcher.diagram(
    node-stroke: 1pt,
    edge-stroke: 1pt,
    node-corner-radius: 8pt,
    edge-corner-radius: 8pt,
    node-fill: rgb("eee"),
    spacing: 3em,
    node((0, 0), `traffic`),
    edge_(),
    node((1, 0), `open-world classification`),
    edge_("d", `unmonitored`),
    edge_(`monitored`),
    node((3, 0), `closed-world classification`),
    edge_(),
    node((3, 1), `prediction`),
    node((1, 1), `unknown`),
  )))
]

#focus-title-slide[
  A #text(fill: rot)[Testbed] for Voice Assistant \
  Traffic Fingerprinting
]

#slide(title: [Comparison], new-section: [Testbed])[
  #pdfpc.speaker-note("Why? Let's look at Website Traffic Fingerprinting")

  #side-by-side[
    Website Fingerprinting:
    - Requires a large amount of data
    - Data collection usually via a program making requests
    - Only dependent on the network environment
    - Fast
  ][
    Voice Command Fingerprinting:
    #pause
    - Requires a large amount of data
    #pause
    - Interaction by speaking \ ~
    #pause
    - Hampered by environmental noise \ ~
    #pause
    - Slow and inefficient
    #pause
    #text(fill: rot)[$->$ Sophisticated testbed]
  ]
]

#slide(title: [])[
  #counter(footnote).update(0) // reset footnote counter
  #pdfpc.speaker-note("Found in one of the newer papers")

  #v(4em)
  #quote(attribution: [Mao et al.#footnote[J. Mao et al., #quote[A novel model for voice command fingerprinting using deep learning], March 2022, Available: #link("https://doi.org/10.1016/j.jisa.2021.103085")]], block: true, quotes: true)[The content of voice commands may vary from date to date; therefore, more efficient data collection tools need to be developed.]
]

#slide(title: [Requirements])[
  - Sound isolation
    - Isolated box
    - Separate speaker/microphone
  - Efficiency
    - Every second saved per interaction means hours saved when collecting tens of thousands of interactions
    - Dynamic interaction length by listening for silence
  - Robustness
    - Autonomously reset the VA if an error occurs
    - Monitoring system
]

#slide(title: [System])[
  #text(size: 0.8em)[
    / `varys`: The main executable combining all modules into the final system.
    / `varys-analysis`: Analysis of the data collected by `varys`.
    / `varys-audio`: Audio recording and the TTS and STT systems.
    / `varys-database`: Abstraction of the database system where interactions are stored.
    / `varys-network`: Collection of network traffic, writing and parsing of `.pcap` files (sketched below).
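
    A minimal sketch of what such a capture yields, using scapy rather than the `varys-network` code; the MAC address is the one from the demo slide, and the direction convention is an assumption:

    #text(size: 0.8em)[
      ```python
      from scapy.all import rdpcap

      SPEAKER_MAC = "f4:34:f0:89:2d:75"  # smart speaker address, as on the demo slide

      def trace(pcap_path):
          # per-packet (size, direction): 0 = sent by the speaker, 1 = received (assumed convention)
          return [(len(p), 0 if p.src == SPEAKER_MAC else 1) for p in rdpcap(pcap_path)]
      ```
    ]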
  ]

  #v(1em)
  #align(center, text(size: 0.7em)[
    #fletcher.diagram(
      node-stroke: 1pt,
      edge-stroke: 1pt,
      node-corner-radius: 4pt,
      edge-corner-radius: 4pt,
      spacing: 1.5em,
      node((0, 0), `varys`, fill: rgb("eee")),
      edge_("r"),
      edge_("dd", "l", "d"),
      edge_("ddd"),
      edge_("dd", "r", "d"),
      group_node(anthrazit, (1, 0), "varys-analysis"),
      edge_("d", (0.12, 1), (0.12, 2.625)),
      edge_("d", (1.2, 1), (1.2, 2.625)),
      group_node(anthrazit, (-1, 3), "varys-audio"),
      group_node(anthrazit, (0, 3), "varys-network"),
      group_node(anthrazit, (1, 3), "varys-database"),
    )
  ])
]

#focus-title-slide[
  Results
]

#slide(title: [Datasets], new-section: [Results])[
  #v(2em)

  \~800h, \~70'000 interactions
  ~

  / ` large`: 227 queries, 140 interactions each
  / ` small`: 13 queries, 2400 interactions each
  / `binary`: #quote[Call John Doe] and #quote[Call Mary Poppins], 1500 interactions each
]

#slide(title: [Efficiency])[
  #pdfpc.speaker-note("Some outliers like \"tell me a story\" at >1min")

  #align(center, text(size: 0.8em)[
    #cetz.canvas({
      import cetz.draw: *
      import cetz.plot

      let data = csv("csv/aggregate_average_duration.csv").map(item => {
        (float(item.at(0)), int(item.at(1)))
      })
      let data_remaining = csv("csv/aggregate_average_remaining_duration.csv").map(item => {
        (float(item.at(0)), int(item.at(1)))
      })

      set-style(legend: (padding: 5pt, item: (spacing: 10pt)))
      plot.plot(
        size: (24, 8),
        axis-style: "scientific-auto",
        legend: "legend.inner-north",
        x-label: "average duration [s]",
        x-tick-step: 2,
        x-min: 0,
        x-max: 36,
        y-label: "queries",
        y-tick-step: 1,
        y-min: 0,
        {
          for item in data {
            plot.add(
              ((item.at(0), 0), (..item)),
              style: (stroke: 2pt + mint),
            )
          }
          plot.add(
            ((100, 0), (100, 1)),
            style: (stroke: 8pt + mint),
            label: "Avg. Speaking Duration",
          )
          for item in data_remaining {
            plot.add(
              ((item.at(0), 0), (..item)),
              style: (stroke: 2pt + rot),
            )
          }
          plot.add(
            ((100, 0), (100, 1)),
            style: (stroke: 8pt + rot),
            label: "Avg. Remaining Duration",
          )
        })
    })
  ])
]

#slide(title: [Traffic Trace Examples])[
  #side-by-side[
    #image(width: 100%, height: 89%, "images/plots/plot-Hey Siri. Any missed calls.png")
    #v(10pt, weak: true)
    #quote[Any missed calls?]
  ][
    #image(width: 100%, height: 89%, "images/plots/plot-Hey Siri. What day was 90 days ago.png")
    #v(10pt, weak: true)
    #quote[What day was 90 days ago?]
  ]
]

#slide(title: [Fingerprinting Model])[
  #counter(footnote).update(0) // reset footnote counter

  #text(size: 0.96em)[
    Feature extraction #text(size: 0.8em)[(packet size $s in [0, 1500]$ and direction $d in {0, 1}$)]:

    $ (s, d) -> (-1)^(d) dot s/1500 #h(0.8em) $

    CNN adapted from Wang et al.#footnote[C. Wang et al., #quote[Fingerprinting Encrypted Voice Traffic on Smart Speakers with Deep Learning], May 2020, Available: #link("https://doi.org/10.1145/3395351.3399357")]:
  ]

  #text(size: 0.65em)[
    #fletcher.diagram(
      node-stroke: 1pt,
      node-fill: rgb("eee"),
      edge-stroke: 1pt,
      node-corner-radius: 4pt,
      edge-corner-radius: 4pt,
      spacing: 1.5em,
      group_node(rot, (0, 0), align(center)[Input]),
      edge_(),
      node((1, 0), align(center)[Conv. Layer \ `[tanh]`]),
      edge_(),
      node((2, 0), align(center)[Dropout \ $0.1$]),
      edge_(),
      node((3, 0), align(center)[Global Average \ Pooling]),
      edge_(),
      node((4, 0), align(center)[Dense Layer \ `[elu]`]),
      edge_(),
      node((5, 0), align(center)[Dense Layer \ `[softmax]`]),
      edge_(),
      group_node(rot, (6, 0), align(center)[Output]),
      render: (grid, nodes, edges, options) => {
        let cnn_1 = (nodes.at(1), nodes.at(2), nodes.at(3))
        cetz.canvas({
          enclose_nodes(cnn_1, rgb(0, 0, 0, 50%), clearance: (34pt, 64pt, 34pt, 61pt))
          fletcher.draw-diagram(grid, nodes, edges, options)
        })
      }
    )
  ]
]

#slide(title: [Classification])[
  #v(2em)

  Accuracy on test sets:

  / ` large`: \~40.40% (random choice \~0.44%)
  / ` small`: \~86.19% (random choice \~7.69%)
  / `binary`: \~71.19% (random choice 50%)
]

#slide(title: [Demo], new-section: [])[
  #pdfpc.speaker-note("Since I've trained the model on my laptop, I used a smaller network.")

  #text(size: 0.66em)[
    ```sh
    ./target/release/varys -i ap1 analyse demo data/ml/test_5_13\ queries_0.86 f4:34:f0:89:2d:75
    ```
  ]

  #quote[Hey Siri, any missed calls?]

  #quote[Hey Siri, remind me to wash the car.]

  #v(1em)
  #text(size: 0.66em)[It is unlikely this will work...]
]

#slide(header: pad(left: 1em, top: 0.5em, heading[Timeline References]))[
  #text(size: 0.7em)[
    - D. Wagner and B. Schneier, #quote[Analysis of the SSL 3.0 Protocol], November 1996, Available: #link("https://dl.acm.org/doi/10.5555/1267167.1267171")
    - H. Cheng and R. Avnur, #quote[Traffic Analysis of SSL Encrypted Web Browsing], 1998
    - K. Abe and S. Goto, #quote[Fingerprinting Attack on Tor Anonymity using Deep Learning], August 2016, Available: #link("https://core.ac.uk/display/229876143")
    - S. Kennedy et al., #quote[I Can Hear Your Alexa: Voice Command Fingerprinting on Smart Home Speakers], June 2019, Available: #link("https://doi.org/10.1109/CNS.2019.8802686")
    - C. Wang et al., #quote[Fingerprinting Encrypted Voice Traffic on Smart Speakers with Deep Learning], May 2020, Available: #link("https://doi.org/10.1145/3395351.3399357")
    - J. Mao et al., #quote[A novel model for voice command fingerprinting using deep learning], March 2022, Available: #link("https://doi.org/10.1016/j.jisa.2021.103085")
    - D. Ahmed, A. Sabir, and A. Das, #quote[Spying through Your Voice Assistants: Realistic Voice Command Fingerprinting], August 2023, Available: #link("https://www.usenix.org/conference/usenixsecurity23/presentation/ahmed-dilawer")
  ]
]
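
#slide(title: [Backup: Model Sketch])[
  #text(size: 0.6em)[
    A minimal, illustrative sketch rather than the `varys-analysis` implementation: the feature mapping from the #quote[Fingerprinting Model] slide and a Keras skeleton that follows the layer sequence of the diagram. Trace length, filter count, kernel size, and dense width are placeholders; the 13 output classes correspond to the `small` dataset.

    ```python
    from tensorflow import keras
    from tensorflow.keras import layers

    def normalise(size, direction):
        # (s, d) -> (-1)^d * s / 1500, the mapping shown on the Fingerprinting Model slide
        return (-1) ** direction * size / 1500

    model = keras.Sequential([
        keras.Input(shape=(475, 1)),                 # placeholder: packets per trace
        layers.Conv1D(128, 8, activation="tanh"),    # placeholder filter count and kernel size
        layers.Dropout(0.1),
        layers.GlobalAveragePooling1D(),
        layers.Dense(128, activation="elu"),         # placeholder width
        layers.Dense(13, activation="softmax"),      # 13 queries in the `small` dataset
    ])
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    ```
  ]
]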