Testado com Scala 2.11.11, sbt 0.13.15 e Spark 2.1.0
build.sbt
Este build.sbt realiza as seguintes coisas:
* permite executar `sbt run` a partir do shell ou terminal
* usa o plugin sbt-assembly para empacotar adequadamente o jar para uso com
  spark-submit em uma instância Spark em execução dentro do contêiner Docker
// Project metadata for the Spark MLlib test job; the jar name used by
// spark-submit is set separately via assemblyJarName further down.
name := "spark-mllib-test"
version := "1.0"
// Scala 2.11 line — matches the Scala version Spark 2.1.0 is built against.
scalaVersion := "2.11.11"
// Single Spark version shared by all spark-* dependencies below.
val sparkVersion = "2.1.0"
// Spark artifacts are marked "provided": the Spark runtime on the cluster already
// ships them, so sbt-assembly keeps them out of the fat jar. (The Runtime
// classpath is re-expanded later in this file so `sbt run` still works locally.)
libraryDependencies ++= Seq (
"org.apache.spark" %% "spark-core" % sparkVersion % "provided", // spark runtime already provides jars
"org.apache.spark" %% "spark-streaming" % sparkVersion % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
// not relevant, just allows me to pass command line options to spark job
"args4j" % "args4j" % "2.33",
// NOTE(review): this artifact hard-codes a Scala 2.10 binary suffix ("_2.10")
// while the project builds with Scala 2.11.11 — mixing binary versions on the
// classpath is unsafe. Confirm whether an args4j-helpers_2.11 build exists (and
// switch to `%%` if it is cross-published) before relying on this.
"com.bizo" % "args4j-helpers_2.10" % "1.0.0"
)
/* without this explicit merge strategy code you get a lot of noise from sbt-assembly
complaining about not being able to dedup files */
// How duplicate entries are resolved when building the fat jar:
//  - for packages pulled in by multiple Spark/Hadoop transitive deps, keep the
//    last occurrence on the classpath;
//  - rename or keep-last a handful of known-conflicting metadata files;
//  - everything else falls through to sbt-assembly's default strategy.
// (The unused `xs @ _*` bindings from the original were dropped: the matched
// path tail was never referenced.)
assemblyMergeStrategy in assembly := {
  case PathList("org", "aopalliance", _*)      => MergeStrategy.last
  case PathList("javax", "inject", _*)         => MergeStrategy.last
  case PathList("javax", "servlet", _*)        => MergeStrategy.last
  case PathList("javax", "activation", _*)     => MergeStrategy.last
  case PathList("org", "apache", _*)           => MergeStrategy.last
  case PathList("com", "google", _*)           => MergeStrategy.last
  case PathList("com", "esotericsoftware", _*) => MergeStrategy.last
  case PathList("com", "codahale", _*)         => MergeStrategy.last
  case PathList("com", "yammer", _*)           => MergeStrategy.last
  case "about.html"                 => MergeStrategy.rename
  case "META-INF/ECLIPSEF.RSA"      => MergeStrategy.last
  case "META-INF/mailcap"           => MergeStrategy.last
  case "META-INF/mimetypes.default" => MergeStrategy.last
  case "plugin.properties"          => MergeStrategy.last
  case "log4j.properties"           => MergeStrategy.last
  case "overview.html"              => MergeStrategy.last // Added this for 2.1.0 I think
  case x =>
    // Delegate anything we did not explicitly handle to the default strategy.
    val oldStrategy = (assemblyMergeStrategy in assembly).value
    oldStrategy(x)
}
/* including scala bloats your assembly jar unnecessarily, and may interfere with
spark runtime */
// Exclude the Scala standard library from the fat jar; the Spark runtime
// provides its own copy of the matching Scala version.
assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
// Fixed jar name (no version suffix) so spark-submit invocations stay stable.
assemblyJarName in assembly := "spark-mllib-test.jar"
/* you need to be able to undo the "provided" annotation on the deps when running your spark
programs locally i.e. from sbt; this bit reincludes the full classpaths in the compile and run tasks. */
// Maps the Runtime classpath to the Compile/run classpath, which still contains
// the "provided" Spark jars — so `sbt run` works without a Spark installation.
fullClasspath in Runtime := (fullClasspath in (Compile, run)).value