presentation
549 main.typ Normal file

@@ -0,0 +1,549 @@
#import "@preview/polylux:0.3.1": *
#import "university-custom.typ": *
#import "@preview/fletcher:0.4.1" as fletcher: node, edge
#import "@preview/cetz:0.2.0"

#let mint = rgb("a5d7d2")
#let mint-hell = rgb("d2ebe9")
#let rot = rgb("d20537")
#let anthrazit = rgb("2d373c")
#let anthrazit-hell = rgb("46505a")
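
// Render inline quotes in italics; block quotes keep the default style.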
#show quote: it => {
  if it.block {
    it
  } else {
    emph(it)
  }
}

// diagrams
#let edge_(..args) = edge(..args, marks: (none, "|>")) // we can't use set rules for user-defined functions yet, so we overwrite it
#let group_node(colour, ..args) = node(
  ..args,
  stroke: colour,
  fill: colour.lighten(80%),
)
#let group_edge(colour, description, ..args) = edge(
  ..args,
  text(colour)[#description],
  "..",
  stroke: colour,
  label-side: left,
)
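// Draws a rounded, tinted rectangle behind the given fletcher nodes: the
// bounding rect of their positions is grown by `clearance` (top, right,
// bottom, left, judging by how it is applied) and its center is shifted to
// compensate for asymmetric clearance.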
#let enclose_nodes(nodes, colour, clearance: (8pt, 8pt, 8pt, 8pt)) = {
  let (center, size) = fletcher.bounding-rect(nodes.map(node => node.real-pos))
  center.at(0) = center.at(0) - (clearance.at(3) - clearance.at(1))
  center.at(1) = center.at(1) - (clearance.at(2) - clearance.at(0))

  cetz.draw.content(
    center,
    rect(
      width: size.at(0) + clearance.at(1) + clearance.at(3),
      height: size.at(1) + clearance.at(2) + clearance.at(0),
      radius: 16pt,
      stroke: colour,
      fill: colour.lighten(85%),
    )
  )
}

// functions
#let focus-title-slide(title) = {
  focus-slide(background-color: white)[
    #align(center, text(fill: anthrazit, size: 0.7em, weight: "bold", title))
  ]
}

// #pdfpc.config(
//   duration-minutes: 30,
//   // start-time: "10:15",
//   // end-time: "10:45",
//   // note-font-size: 24,
// )

#show: university-theme.with(
  short-title: [A Testbed for Voice Assistant Traffic Fingerprinting],
  short-author: [Milan van Zanten],
  color-a: anthrazit,
  color-b: mint,
)

#title-slide(
  title: [A Testbed for Voice Assistant \ Traffic Fingerprinting],
  subtitle: [Master Thesis Presentation],
  authors: [Milan van Zanten],
  date: [21.03.2024],
  institution-name: [University of Basel],
  logo: pad(1em, image("unibas-logo.svg"))
)

#slide(title: [Outline])[
  + Voice Assistants
  + Traffic Fingerprinting
  + Testbed
  + Results
  + Demo

  Ask questions any time!
]

#focus-title-slide[
  A Testbed for #text(fill: rot)[Voice Assistant] \ Traffic Fingerprinting
]

#slide(title: [Devices], new-section: [Voice Assistants])[
  #pdfpc.speaker-note("Three main VAs")

  Specifically, #emph[Smart Speakers]
  #only(1)[
    #side-by-side[
      #align(center)[
        #image(width: 50%, "images/alexa.jpg")

        Echo Dot

        #emph[Amazon Alexa]
      ]
    ][
      #align(center)[
        #image(width: 50%, "images/siri.jpg")

        HomePod Mini

        #emph[Siri]
      ]
    ][
      #align(center)[
        #image(width: 50%, "images/google-assistant.jpg")

        Google Home Mini

        #emph[Google Assistant]
      ]
    ]
  ]
  #only(2)[
    #side-by-side[
      #align(center)[
        #image(width: 40%, "images/alexa.jpg")

        #text(size: .6em)[
          Echo Dot

          #emph[Amazon Alexa]
        ]
      ]
    ][
      #align(center)[
        #image(width: 80%, "images/siri.jpg")

        HomePod Mini

        #emph[Siri]
      ]
    ][
      #align(center)[
        #image(width: 40%, "images/google-assistant.jpg")

        #text(size: .6em)[
          Google Home Mini

          #emph[Google Assistant]
        ]
      ]
    ]
  ]
]

#slide(title: [Smart Speaker Privacy / Security])[
  #counter(footnote).update(0) // reset footnote counter

  #pdfpc.speaker-note("Why do we want to look at smart speakers?")
  #pdfpc.speaker-note("Alexa; guest voice wrongly recognised 65%")

  #alternatives(repeat-last: true)[
    none #h(1em) `(╯°□°)╯︵ ┻━┻`
  ][
    There are concerns...
  ]
  #pause

  - Usually located where sensitive conversations take place
  - Necessarily always listening
  - Misactivations
  - Used to control smart home devices (e.g. door locks)
  - No authentication\*

  #uncover(3)[About 40% of households in the U.S. own a smart speaker.]

  #v(1em)
  #text(0.6em)[\* Voice recognition is still insecure.]
]

#slide(title: [Attacks on Smart Speakers])[
  #pdfpc.speaker-note("skill squatting; specific to alexa, skills basically apps")
  #pdfpc.speaker-note("Boil an egg is an existing skill")

  Active:
  - Malicious activations
  - Similar pronunciations, "skill squatting"
  - (e.g. "Boil an egg" $->$ "Boyle an egg")#footnote[D. Kumar et al., #quote[Skill Squatting Attacks on Amazon Alexa], August 2018, Available: #link("https://www.usenix.org/conference/usenixsecurity18/presentation/kumar")]

  Passive:
  - #alternatives[Traffic Fingerprinting][#text(fill: rot)[Traffic Fingerprinting]]
]

#focus-title-slide[
  A Testbed for Voice Assistant \ #text(fill: rot)[Traffic Fingerprinting]
]

#slide(title: [], new-section: [Traffic Fingerprinting])[
  #pdfpc.speaker-note("Just after the release of SSL 3.0")
  #pdfpc.speaker-note("David Wagner and Bruce Schneier")
  #pdfpc.speaker-note("Probably one of the first mentions")

  #v(2em)

  #quote(attribution: [Wagner and Schneier#footnote[D. Wagner and B. Schneier, #quote[Analysis of the SSL 3.0 Protocol], November 1996, Available: #link("https://dl.acm.org/doi/10.5555/1267167.1267171")]], block: true, quotes: true)[[SSL] traffic analysis aims to recover confidential information about protected sessions by examining unencrypted packet fields and #alternatives(repeat-last: true)[unprotected packet attributes][#text(fill: rot)[unprotected packet attributes]]. For example [...] the volume of network traffic flow]

  #uncover(3)[... packet direction, timing, and more]
]

#slide(title: [Timeline])[
  #counter(footnote).update(0) // reset footnote counter

  #pdfpc.speaker-note("Relevant developments in my opinion")
  #pdfpc.speaker-note("Abe and Goto; Denoising Autoencoder on Tor traffic")
  #pdfpc.speaker-note("Mao et al.; time between packets")
  #pdfpc.speaker-note("Ahmed, Sabir and Das; so far all attacks assumed known traffic window, allowed them to do end-to-end")

  / 1996: Wagner and Schneier#footnote[Timeline references can be found at the end of the presentation.], #text(fill: rot)[coined SSL traffic analysis]
  / 1998: Cheng and Avnur, #text(fill: rot)[website traffic analysis]

  #h(4em) #text(fill: rot)[website fingerprinting (WF)...]

  / 2016: Abe and Goto, #text(fill: rot)[deep learning WF]
  / 2019: Kennedy et al., #text(fill: rot)[apply WF techniques to voice assistants (VA)]
  / 2020: Wang et al., #text(fill: rot)[deep learning VA fingerprinting]
  / 2022: Mao et al., #text(fill: rot)[temporal features]
  / 2023: Ahmed, Sabir and Das, #text(fill: rot)[invocation detection]
]

#slide(title: [Threat Model])[
  #pdfpc.speaker-note("2. to filter traffic")

  #align(center, text(size: 0.7em,
    fletcher.diagram(
      node-stroke: 1pt,
      edge-stroke: 1pt,
      node-corner-radius: 8pt,
      edge-corner-radius: 8pt,
      node-fill: rgb("eee"),
      spacing: 3em,
      node((-2, 0), `VA Server`),
      edge(`WAN`),
      node((0, 0), `Gateway`),
      edge("rr", `LAN`),
      edge("r", "d", "r"),
      node((2, 0), `Smart Speaker`),
      node((2, 1), `Other Devices`),
      group_node(rot, (1, -1), `Attacker`),
      edge((1, -0.4), text(fill: rot)[`Intercept`], stroke: rot),
    )
  ))

  + The attacker can intercept traffic from the smart speaker
  + The attacker knows the smart speaker's address
  + The attacker knows the type of smart speaker used
  + The attacker knows the beginning and end of an interaction
]

#slide(title: [Closed-World])[
  #v(2em)

  - Fixed list of monitored voice commands
  - All traffic is assumed to come from one of the monitored commands
  - Multiclass classification

  Predicts which command was used.
]

#slide(title: [Open-World])[
  #v(2em)

  - Traffic can also come from new, unmonitored commands
  - Binary classification

  Predicts whether traffic is from a monitored or an unmonitored command.
]

#slide(title: [Combining Both Models])[
  #v(2em)

  #align(center, text(size: 0.7em,
    fletcher.diagram(
      node-stroke: 1pt,
      edge-stroke: 1pt,
      node-corner-radius: 8pt,
      edge-corner-radius: 8pt,
      node-fill: rgb("eee"),
      spacing: 3em,
      node((0, 0), `traffic`),
      edge_(),
      node((1, 0), `open-world classification`),
      edge_("d", `unmonitored`),
      edge_(`monitored`),
      node((3, 0), `closed-world classification`),
      edge_(),
      node((3, 1), `prediction`),
      node((1, 1), `unknown`),
    )
  ))
]

#focus-title-slide[
  A #text(fill: rot)[Testbed] for Voice Assistant \ Traffic Fingerprinting
]

#slide(title: [Comparison], new-section: [Testbed])[
  #pdfpc.speaker-note("Why? Let's look at Website Traffic Fingerprinting")

  #side-by-side[
    Website Fingerprinting:
    - Requires a large amount of data
    - Data collection usually via a program making requests
    - Only dependent on the network environment
    - Fast
  ][
    Voice Command Fingerprinting: #pause
    - Requires a large amount of data #pause
    - Interaction by speaking \ ~ #pause
    - Hampered by environmental noise \ ~ #pause
    - Slow and inefficient #pause
    #text(fill: rot)[$->$ Sophisticated testbed]
  ]
]

#slide(title: [])[
  #counter(footnote).update(0) // reset footnote counter

  #pdfpc.speaker-note("Found in one of the newer papers")

  #v(4em)

  #quote(attribution: [Mao et al.#footnote[J. Mao et al., #quote[A novel model for voice command fingerprinting using deep learning], March 2022, Available: #link("https://doi.org/10.1016/j.jisa.2021.103085")]], block: true, quotes: true)[The content of voice commands may vary from date to date; therefore, more efficient data collection tools need to be developed.]
]

#slide(title: [Requirements])[
  - Sound isolation
    - Isolated box
    - Separate speaker/microphone
  - Efficiency
    - Every second saved per interaction means hours saved when collecting tens of thousands of interactions
    - Dynamic interaction length by listening for silence
  - Robustness
    - Autonomously reset the VA if an error occurs
    - Monitoring system
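
  // e.g. one second saved per interaction adds up to roughly 19 hours over
  // the ~70'000 interactions collected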
]

#slide(title: [System])[
  #text(size: 0.8em)[
    / `varys`: The main executable combining all modules into the final system.
    / `varys-analysis`: Analysis of data collected by varys.
    / `varys-audio`: Audio recording and the TTS and STT systems.
    / `varys-database`: Abstraction of the database system where interactions are stored.
    / `varys-network`: Collection of network traffic, writing and parsing of `.pcap` files.
  ]

  #v(1em)

  #align(center, text(size: 0.7em)[
    #fletcher.diagram(
      node-stroke: 1pt,
      edge-stroke: 1pt,
      node-corner-radius: 4pt,
      edge-corner-radius: 4pt,
      spacing: 1.5em,
      node((0, 0), `varys`, fill: rgb("eee")),
      edge_("r"),
      edge_("dd", "l", "d"),
      edge_("ddd"),
      edge_("dd", "r", "d"),
      group_node(anthrazit, (1, 0), "varys-analysis"),
      edge_("d", (0.12, 1), (0.12, 2.625)),
      edge_("d", (1.2, 1), (1.2, 2.625)),
      group_node(anthrazit, (-1, 3), "varys-audio"),
      group_node(anthrazit, (0, 3), "varys-network"),
      group_node(anthrazit, (1, 3), "varys-database"),
    )
  ])
]

#focus-title-slide[
  Results
]

#slide(title: [Datasets], new-section: [Results])[
  #v(2em)

  \~800h, \~70'000 interactions

  ~

  / ` large`: 227 queries, 140 interactions each
  / ` small`: 13 queries, 2400 interactions each
  / `binary`: #quote[Call John Doe] and #quote[Call Mary Poppins], 1500 interactions each
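
  // dataset sizes: 227·140 + 13·2400 + 2·1500 ≈ 66'000 of the ~70'000 interactions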
]

#slide(title: [Efficiency])[
  #pdfpc.speaker-note("Some outliers like \"tell me a story\" at >1min")

  #align(center, text(size: 0.8em)[
    #cetz.canvas({
      import cetz.draw: *
      import cetz.plot

      let data = csv("csv/aggregate_average_duration.csv").map(item => {
        (float(item.at(0)), int(item.at(1)))
      })
      let data_remaining = csv("csv/aggregate_average_remaining_duration.csv").map(item => {
        (float(item.at(0)), int(item.at(1)))
      })
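      // assumed CSV layout: rows of (average duration [s], number of queries);
      // each row is drawn below as a vertical line of that height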

      set-style(legend: (padding: 5pt, item: (spacing: 10pt)))
      plot.plot(
        size: (24, 8),
        axis-style: "scientific-auto",
        legend: "legend.inner-north",
        x-label: "average duration [s]",
        x-tick-step: 2,
        x-min: 0,
        x-max: 36,
        y-label: "queries",
        y-tick-step: 1,
        y-min: 0,
        {
          for item in data {
            plot.add(
              ((item.at(0), 0), (..item)),
              style: (stroke: 2pt + mint),
            )
          }
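          // the two series at x = 100 (beyond x-max: 36) are never visible;
          // they appear to exist only to produce the thick legend swatches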
          plot.add(
            ((100, 0), (100, 1)),
            style: (stroke: 8pt + mint),
            label: "Avg. Speaking Duration",
          )

          for item in data_remaining {
            plot.add(
              ((item.at(0), 0), (..item)),
              style: (stroke: 2pt + rot),
            )
          }
          plot.add(
            ((100, 0), (100, 1)),
            style: (stroke: 8pt + rot),
            label: "Avg. Remaining Duration",
          )
        })
    })
  ])
]

#slide(title: [Traffic Trace Examples])[
  #side-by-side[
    #image(width: 100%, height: 89%, "images/plots/plot-Hey Siri. Any missed calls.png")
    #v(10pt, weak: true)
    #quote[Any missed calls?]
  ][
    #image(width: 100%, height: 89%, "images/plots/plot-Hey Siri. What day was 90 days ago.png")
    #v(10pt, weak: true)
    #quote[What day was 90 days ago?]
  ]
]

#slide(title: [Fingerprinting Model])[
  #counter(footnote).update(0) // reset footnote counter

  #text(size: 0.96em)[
    Feature extraction #text(size: 0.8em)[(packet size $s in [0, 1500]$ and direction $d in {0, 1}$)]:
    $
      (s, d) -> (-1)^(d) dot s/1500 #h(0.8em)
    $
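    // e.g. a 375-byte packet with d = 0 maps to +0.25; a 1500-byte packet
    // with d = 1 maps to -1.0 (which direction d = 0 denotes is not fixed here)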

    CNN adapted from Wang et al.#footnote[C. Wang et al., #quote[Fingerprinting Encrypted Voice Traffic on Smart Speakers with Deep Learning], May 2020, Available: #link("https://doi.org/10.1145/3395351.3399357")]:
  ]

  #text(size: 0.65em)[
    #fletcher.diagram(
      node-stroke: 1pt,
      node-fill: rgb("eee"),
      edge-stroke: 1pt,
      node-corner-radius: 4pt,
      edge-corner-radius: 4pt,
      spacing: 1.5em,
      group_node(rot, (0, 0), align(center)[Input]),
      edge_(),
      node((1, 0), align(center)[Conv. Layer \ `[tanh]`]),
      edge_(),
      node((2, 0), align(center)[Dropout \ $0.1$]),
      edge_(),
      node((3, 0), align(center)[Global Average \ Pooling]),
      edge_(),
      node((4, 0), align(center)[Dense Layer \ `[elu]`]),
      edge_(),
      node((5, 0), align(center)[Dense Layer \ `[softmax]`]),
      edge_(),
      group_node(rot, (6, 0), align(center)[Output]),
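      // custom render: first enclose the convolution block (nodes 1-3) in a
      // translucent box, then draw the diagram on top of it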
      render: (grid, nodes, edges, options) => {
        let cnn_1 = (nodes.at(1), nodes.at(2), nodes.at(3))
        cetz.canvas({
          enclose_nodes(cnn_1, rgb(0, 0, 0, 50%), clearance: (34pt, 64pt, 34pt, 61pt))
          fletcher.draw-diagram(grid, nodes, edges, options)
        })
      }
    )
  ]
]

#slide(title: [Classification])[
  #v(2em)

  Accuracy on the test sets:

  / ` large`: \~40.40% (random choice \~0.44%)
  / ` small`: \~86.19% (random choice \~7.69%)
  / `binary`: \~71.19% (random choice 50%)
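
  // random-choice baselines: 1/227 ≈ 0.44%, 1/13 ≈ 7.69%, 1/2 = 50%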
]

#slide(title: [Demo], new-section: [])[
  #pdfpc.speaker-note("Since I've trained the model on my laptop, I used a smaller network.")

  #text(size: 0.66em)[
    ```sh
    ./target/release/varys -i ap1 analyse demo data/ml/test_5_13\ queries_0.86 f4:34:f0:89:2d:75
    ```
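    // argument meaning (assumed): `-i ap1` is the capture interface, the path
    // points at the trained model, and the MAC address is the smart speaker's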
  ]

  #quote[Hey Siri, any missed calls?]

  #quote[Hey Siri, remind me to wash the car.]

  #v(1em)
  #text(size: 0.66em)[It is unlikely this will work...]
]

#slide(header: pad(left: 1em, top: 0.5em, heading[Timeline References]))[
  <references>
  #text(size: 0.7em)[
    - D. Wagner and B. Schneier, #quote[Analysis of the SSL 3.0 Protocol], November 1996, Available: #link("https://dl.acm.org/doi/10.5555/1267167.1267171")
    - H. Cheng and R. Avnur, #quote[Traffic Analysis of SSL Encrypted Web Browsing], 1998
    - K. Abe and S. Goto, #quote[Fingerprinting Attack on Tor Anonymity using Deep Learning], August 2016, Available: #link("https://core.ac.uk/display/229876143")
    - S. Kennedy et al., #quote[I Can Hear Your Alexa: Voice Command Fingerprinting on Smart Home Speakers], June 2019, Available: #link("https://doi.org/10.1109/CNS.2019.8802686")
    - C. Wang et al., #quote[Fingerprinting Encrypted Voice Traffic on Smart Speakers with Deep Learning], May 2020, Available: #link("https://doi.org/10.1145/3395351.3399357")
    - J. Mao et al., #quote[A novel model for voice command fingerprinting using deep learning], March 2022, Available: #link("https://doi.org/10.1016/j.jisa.2021.103085")
    - D. Ahmed, A. Sabir, and A. Das, #quote[Spying through Your Voice Assistants: Realistic Voice Command Fingerprinting], August 2023, Available: #link("https://www.usenix.org/conference/usenixsecurity23/presentation/ahmed-dilawer")
  ]
]