I have a PySpark DataFrame in which each row is a user and each column holds that user's value for a file on a server:

..       server\file1   server\file2   server\file3
User1          1              2              3
User2          5              7              9
User3          1              7            100

I want the set of unique values that appear across all of the file columns, ignoring the User column; for this example that would be {1, 2, 3, 5, 7, 9, 100}.
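For reference, a minimal sketch that builds this frame (df_new is the name the answers below use; the backslashes in the column names have to be escaped in Python string literals):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Sample frame matching the table above
df_new = spark.createDataFrame(
    [("User1", 1, 2, 3), ("User2", 5, 7, 9), ("User3", 1, 7, 100)],
    ["User", "server\\file1", "server\\file2", "server\\file3"],
)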
I also tried assembling the file columns into a single vector column:

..       vector
User1    [1, 2, 3]
User2    [5, 7, 9]
User3    [1, 7, 100]

but I am not sure how to get the distinct values out of that representation.
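The question does not say how that vector column was produced; one way to get a frame of that shape is pyspark.ml's VectorAssembler (an assumption, and note it yields doubles such as [1.0, 2.0, 3.0] rather than ints):

from pyspark.ml.feature import VectorAssembler

# Assumes the df_new sketched above; VectorAssembler is only a guess at
# how the vector column was built, and it produces doubles, not ints
assembler = VectorAssembler(
    inputCols=[c for c in df_new.columns if c != "User"],
    outputCol="vector",
)
df_vec = assembler.transform(df_new).select("User", "vector")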
One way is to collect_set each column except User into a single row and flatten the resulting arrays into a Python set:

from itertools import chain
from pyspark.sql.functions import collect_set

# One deduplicated array per file column, all in a single collected row
vals = df_new.select(*(collect_set(c).alias(c) for c in df_new.columns if c != 'User')).collect()[0].asDict().values()
unique_values = set(chain.from_iterable(vals))
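Against the sample df_new above, the intermediate and final results look like this (a sketch; element order within the arrays and the set is not deterministic):

print(list(vals))
# e.g. [[1, 5], [2, 7], [3, 9, 100]]
print(unique_values)
# {1, 2, 3, 5, 7, 9, 100}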
Another option is to select each file column as a single-column DataFrame, union them all (the union matches columns by position, so the differing names do not matter), and keep the distinct rows:

from functools import reduce
from pyspark.sql import DataFrame

def unionAll(*dfs):
    # DataFrame.union is the non-deprecated name for unionAll in Spark 2.0+
    return reduce(DataFrame.union, dfs)

unionAll(*[df_new.select(c) for c in df_new.columns[1:]]).distinct()
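Collecting that back to the driver gives the same Python set as the first answer (a sketch, again assuming the sample df_new):

distinct_df = unionAll(*[df_new.select(c) for c in df_new.columns[1:]]).distinct()
unique_values = {row[0] for row in distinct_df.collect()}
print(unique_values)
# {1, 2, 3, 5, 7, 9, 100}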