File Corruption Example with Append

Example of writing to file from multiple threads that corrupts the content, even when we append.

Testing with SIMPLE_APPEND - Corruption started past 16K characters (presuming about 16KB)

  • write_type=[SIMPLE_APPEND] characters_per_write=[1000] corruption_rate=0.00% co-routines=[2]
  • write_type=[SIMPLE_APPEND] characters_per_write=[2000] corruption_rate=0.00% co-routines=[2]
  • ...
  • write_type=[SIMPLE_APPEND] characters_per_write=[16000] corruption_rate=0.00% co-routines=[2]
  • write_type=[SIMPLE_APPEND] characters_per_write=[17000] corruption_rate=26.50% co-routines=[2]
  • write_type=[SIMPLE_APPEND] characters_per_write=[18000] corruption_rate=42.00% co-routines=[2]
  • write_type=[SIMPLE_APPEND] characters_per_write=[19000] corruption_rate=19.50% co-routines=[2]
  • write_type=[SIMPLE_APPEND] characters_per_write=[20000] corruption_rate=33.00% co-routines=[2]

Testing with BUFFERED_WRITER - Corruption started past 8K characters (presuming around 8KB)

  • write_type=[BUFFERED_WRITER] characters_per_write=[1000] corruption_rate=0.00% co-routines=[2]
  • write_type=[BUFFERED_WRITER] characters_per_write=[2000] corruption_rate=0.00% co-routines=[2]
  • ...
  • write_type=[BUFFERED_WRITER] characters_per_write=[8000] corruption_rate=0.00% co-routines=[2]
  • write_type=[BUFFERED_WRITER] characters_per_write=[9000] corruption_rate=29.50% co-routines=[2]
  • write_type=[BUFFERED_WRITER] characters_per_write=[10000] corruption_rate=38.50% co-routines=[2]

Reproducible snapshot

GT-Sandbox-Snapshot

Code

package com.glassthought.sandbox

import kotlinx.coroutines.*
import java.io.File
import java.io.FileWriter
import java.io.BufferedWriter
import kotlin.system.measureTimeMillis

enum class FileWriteType {
  SIMPLE_APPEND,
  BUFFERED_WRITER
}

fun main(args: Array<String>) = runBlocking {
  println("#### Testing with SIMPLE_APPEND:")
  for (i in 1..20) {
    go(
      TestOptions(
        charsPerWrite = i * 1_000,
        writeType = FileWriteType.SIMPLE_APPEND
      )
    ).getOneLinerReport().print()
  }

  println("\n#### Testing with BUFFERED_WRITER:")
  for (i in 1..20) {
    go(
      TestOptions(
        charsPerWrite = i * 1_000,
        writeType = FileWriteType.BUFFERED_WRITER
      )
    ).getOneLinerReport().print()
  }
}

data class TestOptions(
  val numCoroutines: Int = 2,
  val charsPerWrite: Int,
  val iterationsPerCoroutine: Int = 100,
  val writeType: FileWriteType = FileWriteType.SIMPLE_APPEND
)

data class CorruptedLine(
  val lineNumber: Int,
  val line: String,
  val sample: String,
  val charDistribution: List<Pair<Char, Int>>,
  val lineLength: Int
)

interface Report {
  fun print()
}

class OneLinerReport(private val result: TestResult) : Report {
  override fun print() = println(toString())

  override fun toString(): String = buildString {
    val corruptionRateStr = "%.2f%%".format(result.corruptionRate)
    return "- write_type=[${result.options.writeType}] characters_per_write=[${result.options.charsPerWrite}] corruption_rate=${corruptionRateStr} co-routines=[${result.options.numCoroutines}]"
  }
}

class ConciseReport(private val result: TestResult) : Report {
  override fun print() = println(toString())

  override fun toString(): String = buildString {
    appendLine("### CONCISE REPORT characters per write [${result.options.charsPerWrite}]")
    appendLine("Test Parameters:")
    appendLine("  - Write Type: ${result.options.writeType}")
    appendLine("  - Coroutines: ${result.options.numCoroutines}")
    appendLine("  - Characters per write: ${result.options.charsPerWrite}")
    appendLine("  - Iterations per coroutine: ${result.options.iterationsPerCoroutine}")
    appendLine("\nResults:")
    appendLine("  - Execution time: ${result.executionTimeMs}ms")
    appendLine("  - Total lines: ${result.totalLines} (expected: ${result.expectedLines})")
    appendLine("  - Corrupted lines: ${result.corruptedLines.size}")
    appendLine("  - Corruption rate: %.2f%%".format(result.corruptionRate))
    appendLine("  - File: ${result.tmpFile.name}")
  }
}

class FullReport(private val result: TestResult, private val maxSamples: Int = 3) : Report {
  override fun print() = println(toString())

  override fun toString(): String = buildString {
    appendLine("=== FULL REPORT ===")
    appendLine("Test Parameters:")
    appendLine("  - Write Type: ${result.options.writeType}")
    appendLine("  - Coroutines: ${result.options.numCoroutines}")
    appendLine("  - Characters per write: ${result.options.charsPerWrite}")
    appendLine("  - Iterations per coroutine: ${result.options.iterationsPerCoroutine}")
    appendLine("  - Letters used: ${('A'..<('A' + result.options.numCoroutines)).joinToString()}")

    appendLine("\nExecution:")
    appendLine("  - Time: ${result.executionTimeMs}ms")
    appendLine("  - File: ${result.tmpFile.absolutePath}")

    appendLine("\nCorruption Analysis:")
    appendLine("  - Total lines written: ${result.totalLines}")
    appendLine("  - Expected lines: ${result.expectedLines}")
    appendLine("  - Corrupted lines found: ${result.corruptedLines.size}")
    appendLine("  - Corruption rate: %.2f%%".format(result.corruptionRate))

    if (result.corruptedLines.isNotEmpty()) {
      appendLine(
        "\nCorrupted Line Samples (showing ${
          minOf(
            maxSamples,
            result.corruptedLines.size
          )
        } of ${result.corruptedLines.size}):"
      )
      result.corruptedLines.take(maxSamples).forEach { corrupted ->
        appendLine("\n  Line ${corrupted.lineNumber}:")
        appendLine("    Sample: ${corrupted.sample}")
        appendLine("    Character distribution (top 5): ${corrupted.charDistribution}")
        appendLine("    Line length: ${corrupted.lineLength} (expected: ${result.options.charsPerWrite})")
      }

      if (result.corruptedLines.size > maxSamples) {
        appendLine("\n  ... and ${result.corruptedLines.size - maxSamples} more corrupted lines")
      }
    } else {
      appendLine("\n  No corruption detected!")
      appendLine("  Try increasing coroutines/iterations or running multiple times.")
    }

    appendLine("\nFile Statistics:")
    appendLine("  - File size: ${result.fileSize} bytes")
    appendLine("  - Expected size: ~${result.expectedSize} bytes")
    appendLine("  - Size difference: ${result.fileSize - result.expectedSize} bytes")
  }
}

data class TestResult(
  val options: TestOptions,
  val tmpFile: File,
  val executionTimeMs: Long,
  val totalLines: Int,
  val expectedLines: Int,
  val corruptedLines: List<CorruptedLine>,
  val fileSize: Long,
  val expectedSize: Long
) {
  val corruptionRate: Double
    get() = if (totalLines > 0) {
      corruptedLines.size.toDouble() / totalLines * 100
    } else 0.0

  fun getConciseReport(): Report = ConciseReport(this)
  fun getOneLinerReport(): Report = OneLinerReport(this)
  fun getFullReport(maxSamples: Int = 3): Report = FullReport(this, maxSamples)

  fun cleanUp() {
    tmpFile.delete()
  }
}

suspend fun go(options: TestOptions): TestResult {
  val tmpFile = File.createTempFile("corruption_test_", ".txt")
  val letters = ('A'..'Z').toList()
  val selectedLetters = letters.take(options.numCoroutines)

  val executionTime = measureTimeMillis {
    coroutineScope {
      val jobs = selectedLetters.map { letter ->
        launch(Dispatchers.IO) {
          repeat(options.iterationsPerCoroutine) {
            when (options.writeType) {
              FileWriteType.SIMPLE_APPEND -> {
                val stringToAppend = letter.toString().repeat(options.charsPerWrite) + "\n"
                tmpFile.appendText(stringToAppend)
              }

              FileWriteType.BUFFERED_WRITER -> {
                BufferedWriter(FileWriter(tmpFile, true)).use { writer ->
                  val line = letter.toString().repeat(options.charsPerWrite)
                  writer.write(line)
                  writer.newLine()
                }
              }
            }
          }
        }
      }
      jobs.joinAll()
    }
  }

  // Gather corruption data
  var totalLines = 0
  val corruptedLinesList = mutableListOf<CorruptedLine>()

  tmpFile.bufferedReader().use { reader ->
    reader.lineSequence().forEachIndexed { lineNumber, line ->
      totalLines++
      if (line.isNotEmpty()) {
        val firstChar = line[0]
        val isCorrupted = !line.all { it == firstChar }

        if (isCorrupted) {
          val sample = if (line.length > 100) {
            line.substring(0, 100) + "..."
          } else {
            line
          }

          val charCounts = line.groupBy { it }
            .mapValues { it.value.size }
            .toList()
            .sortedByDescending { it.second }
            .take(5)

          corruptedLinesList.add(
            CorruptedLine(
              lineNumber = lineNumber + 1,
              line = line,
              sample = sample,
              charDistribution = charCounts,
              lineLength = line.length
            )
          )
        }
      }
    }
  }

  val expectedLines = options.numCoroutines * options.iterationsPerCoroutine
  val fileSize = tmpFile.length()
  val expectedSize = options.numCoroutines.toLong() * options.iterationsPerCoroutine * (options.charsPerWrite + 1)

  return TestResult(
    options = options,
    tmpFile = tmpFile,
    executionTimeMs = executionTime,
    totalLines = totalLines,
    expectedLines = expectedLines,
    corruptedLines = corruptedLinesList,
    fileSize = fileSize,
    expectedSize = expectedSize
  )
}

Command to reproduce:

gt.sandbox.checkout.commit b54c3189a9b0ede67e37 \
&& cd "${GT_SANDBOX_REPO}" \
&& cmd.run.announce "./gradlew run --quiet"

Recorded output of command:

Picked up JAVA_TOOL_OPTIONS: -Dkotlinx.coroutines.debug
Picked up JAVA_TOOL_OPTIONS: -Dkotlinx.coroutines.debug
#### Testing with SIMPLE_APPEND:
- write_type=[SIMPLE_APPEND] characters_per_write=[1000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[2000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[3000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[4000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[5000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[6000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[7000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[8000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[9000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[10000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[11000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[12000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[13000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[14000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[15000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[16000] corruption_rate=0.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[17000] corruption_rate=26.50% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[18000] corruption_rate=42.00% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[19000] corruption_rate=19.50% co-routines=[2]
- write_type=[SIMPLE_APPEND] characters_per_write=[20000] corruption_rate=33.00% co-routines=[2]

#### Testing with BUFFERED_WRITER:
- write_type=[BUFFERED_WRITER] characters_per_write=[1000] corruption_rate=0.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[2000] corruption_rate=0.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[3000] corruption_rate=0.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[4000] corruption_rate=0.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[5000] corruption_rate=0.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[6000] corruption_rate=0.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[7000] corruption_rate=0.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[8000] corruption_rate=0.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[9000] corruption_rate=29.50% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[10000] corruption_rate=38.50% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[11000] corruption_rate=29.50% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[12000] corruption_rate=34.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[13000] corruption_rate=38.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[14000] corruption_rate=32.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[15000] corruption_rate=28.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[16000] corruption_rate=10.50% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[17000] corruption_rate=49.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[18000] corruption_rate=49.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[19000] corruption_rate=49.00% co-routines=[2]
- write_type=[BUFFERED_WRITER] characters_per_write=[20000] corruption_rate=50.00% co-routines=[2]