\documentclass[12pt]{article}
\usepackage{graphicx}
\usepackage{amsmath,amssymb,url}
\usepackage{bbm}
\usepackage[boxruled,linesnumbered]{algorithm2e}
\usepackage{courier}
\usepackage{hyperref}
\newfont{\bssdoz}{cmssbx10 scaled 1200}
\newfont{\bssten}{cmssbx10}
\oddsidemargin 0in \evensidemargin 0in \topmargin -0.5in
\headheight 0.25in \headsep 0.25in
\textwidth 6.5in \textheight 9in %\marginparsep 0pt \marginparwidth 0pt
\parskip 1.5ex \parindent 0ex \footskip 20pt
\begin{document}
\parindent 0ex
\thispagestyle{empty} \vspace*{-0.75in}
{\bssdoz CME 323: Distributed Algorithms and Optimization}
{\\ \bssten Instructor: Reza Zadeh (rezab@stanford.edu) }
{\\ \bssten HW\#3 -- Due at the beginning of class Tuesday May 22}
\vspace{5pt}
\begin{enumerate}
\item \textbf{Intro to Spark} Download the following materials.
\begin{itemize}
\item \href{https://stanford.edu/~rezab/classes/cme323/S15/slides/itas_workshop.pdf}{Slides}
\item \href{http://training.databricks.com/workshop/usb.zip}{Spark and Data} \end{itemize}
Now, answer the following questions.
\begin{enumerate} \item[(a)] Checkpoint on slide 11.
\item[(b)] Checkpoint on slide 55.
\item[(c)] Checkpoint on slide 60. Note: slide 59 references the file \texttt{CONTRIBUTING.md}
which is in the provided zip file. Instead, use the file
\texttt{website/getting-started.md} \end{enumerate}
Submit your code and answers.
\item \textbf{Intro to Map Reduce}
Warmup question. Assume you are given a typical MapReduce implementation where you only have to write the Map and Reduce functions. The Map function you will write takes as input a (key, value) record and returns either a (key, value) record or nothing. The Reduce function you will write takes as input (key, list of all values for that key) and returns either a record or nothing. The framework already takes care of iterating the Map function over all the records in the input file, key-based intermediate data transfer between Map and Reduce, and storing the returned value of Reduce you do not have to worry about these. You are now given an input file which contains comprehensive information about a social network that has asymmetrical (directed) links, i.e., a network where users follow other users but not necessarily vice-versa (e.g., Twitter). Each record in this input file is (userid-a, userid-b), where userid-a follows userid-b (i.e., points to it). Note that this record tells you nothing about whether or not userid-b follows userid-a. Write a MapReduce program (i.e., Map function and Reduce function) that outputs all pairs of userids who follow each other. Pseudocode is OK.
\item \textbf{Word Count Shuffle} Warmup question. Consider counting the number of
occurrences of words in a collection of documents, where there are only k possible words. Write a MapReduce to achieve this, and analyze the shuffle size with and without combiners being used (assuming B mappers are used).
\item \textbf{Prefix Sum}
The \emph{prefix-sum} operator takes an array $a_1, \ldots, a_n$ and returns
an array $s_1, \ldots, s_n$ where $s_i = \sum_{j \leq i} a_j$. For example, starting
with an array \texttt{[17 0 5 32]} it returns
\texttt{[17 17 22 54]}. Describe (in detail) how to implement \emph{prefix-sum}
in MapReduce, where the input is stored as $\langle i, a_i \rangle$. That is,
the key is the position in the array, and the value is the value at that position.
Analyze the shuffle size and the reduce-key space and time complexity.
\item \textbf{Shallow Graphs} For an undirected graph $G = (V,E)$ with $n$
vertices and $m$ edges $(m \geq n)$, we say that $G$ is shallow
if for every pair of vertices $u,v \in V$, there is a path from $u$
to $v$ of length at most 2 (i.e. using at most two edges).
\begin{enumerate} \item[(a)] Give an algorithm that can decide whether $G$ is shallow in $O(n^{2.376})$ time.
\item[(b)] Given an $n\times r$ matrix $A$ and an $r \times n$ matrix $B$ where
$r \leq n$, show that we can multiply $A$ and $B$ in
$O\left( (n/r)^{2} r^{2.376}\right)$ time. Hint: use the fact that we can multiply
two $r \times r$ matrices in $O(r^{2.376})$ time.
\item[(c)] Give an algorithm that can decide whether $G$ is shallow in
$O(m^{0.55} n^{1.45})$ time. Hint: consider length-2 paths that go from low-degree
vertices and length-2 paths that go through high-degree vertices separately.
Use result from part (b). \end{enumerate}
\end{enumerate}
\end{document}