import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import numpy as np
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

plt.set_loglevel("warning") # Set Matplotlib's log level to "warning"

# Read the CSV file (semicolon as column separator, comma as decimal mark)
file_path = './bikedata/sampled_data_004.csv'
data = pd.read_csv(file_path, sep=";", decimal=",")

# Number of missing values per column
print(data.isna().sum())

Distance           0
Elapsed Time       0
Elevation High     0
Elevation Gain     0
Weekday            0
Bike Type         58
dtype: int64

# Drop every row that contains at least one missing value
data = data.dropna()

# Preview the first rows of the cleaned frame
data.head()

# Column dtypes and non-null counts after cleaning
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 189 entries, 2 to 246
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Distance        189 non-null    float64
 1   Elapsed Time    189 non-null    int64  
 2   Elevation High  189 non-null    float64
 3   Elevation Gain  189 non-null    float64
 4   Weekday         189 non-null    object 
 5   Bike Type       189 non-null    object 
dtypes: float64(3), int64(1), object(2)
memory usage: 10.3+ KB

# Summary statistics of the numeric columns
data.describe()

# Absolute number of rides per bike type
print(data['Bike Type'].value_counts())

Bike Type
race bike        105
trecking bike     61
mtb               23
Name: count, dtype: int64

# Relative share of each bike type, in percent
print(data['Bike Type'].value_counts(normalize=True) * 100)

Bike Type
race bike        55.555556
trecking bike    32.275132
mtb              12.169312
Name: proportion, dtype: float64

# Print the summary statistics of the numeric columns separately per bike type,
# each followed by a 70-character separator line.
grouped_data = data.groupby('Bike Type')
for bike_type, group in grouped_data:
    print(f"Descriptive statistics for {bike_type}:\n")
    print(group.describe())
    print(f"\n{'-' * 70}\n")

Descriptive statistics for mtb:

        Distance  Elapsed Time  Elevation High  Elevation Gain
count  23.000000     23.000000       23.000000       23.000000
mean   17.604348   4560.826087      384.743478      214.761739
std     9.200963   4421.963927      461.970185      121.670654
min     4.250000    952.000000     -120.600000       13.800000
25%    10.955000   2453.500000      170.400000      142.795000
50%    15.640000   3246.000000      359.000000      216.490000
75%    19.710000   4544.000000      413.300000      241.000000
max    40.270000  22875.000000     2279.900000      494.110000

----------------------------------------------------------------------

Descriptive statistics for race bike:

         Distance  Elapsed Time  Elevation High  Elevation Gain
count  105.000000    105.000000      105.000000      105.000000
mean    48.052571   7949.876190      542.256190      588.882000
std     32.171612   6251.371558      352.345925      567.584251
min      3.230000   1233.000000       12.400000       17.230000
25%     30.620000   4825.000000      454.800000      316.000000
50%     36.570000   5909.000000      497.600000      479.000000
75%     51.330000   8364.000000      525.200000      615.000000
max    219.320000  45073.000000     2766.800000     4838.000000

----------------------------------------------------------------------

Descriptive statistics for trecking bike:

         Distance  Elapsed Time  Elevation High  Elevation Gain
count   61.000000     61.000000       61.000000       61.000000
mean    35.706066   6123.131148      343.008197      183.400984
std     17.057845   2981.041532       64.974878      182.905175
min      4.090000   1851.000000       64.600000       48.200000
25%     34.890000   5374.000000      315.400000      123.000000
50%     35.340000   5719.000000      331.000000      144.000000
75%     35.890000   5999.000000      380.200000      161.000000
max    148.490000  21426.000000      543.000000     1444.000000

----------------------------------------------------------------------

# Numeric attributes - one box plot per column
numeric_columns = data.select_dtypes(include='number').columns

for column in numeric_columns:
    plt.figure(figsize=(8, 6))
    sns.boxplot(y=data[column])
    # Title grammar corrected ("des numerischen Attributs", genitive singular)
    plt.title(f'Boxplot des numerischen Attributs: {column}')
    plt.ylabel(column)
    plt.grid(True)
    plt.show()

# Box plots of every ride metric broken down by bike type (2x2 grid)
plt.figure(figsize=(12, 8))
for i, col in enumerate(['Distance', 'Elapsed Time', 'Elevation High', 'Elevation Gain'], start=1):
    plt.subplot(2, 2, i)
    sns.boxplot(data=data, x='Bike Type', y=col)
    plt.title(f'Bike Type vs. {col}')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Box plots of every ride metric broken down by weekday (2x2 grid)
plt.figure(figsize=(12, 8))
for i, col in enumerate(['Distance', 'Elapsed Time', 'Elevation High', 'Elevation Gain'], start=1):
    plt.subplot(2, 2, i)
    sns.boxplot(data=data, x='Weekday', y=col)
    plt.title(f'Weekday vs. {col}')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Categorical attributes - one bar chart of value frequencies per column
categorical_columns = data.select_dtypes(include='object').columns

for column in categorical_columns:
    plt.figure(figsize=(10, 5))
    data[column].value_counts().plot.bar()
    plt.title(f'Häufigkeitsverteilung von {column}')
    plt.xlabel(column)
    plt.ylabel('Häufigkeit')
    plt.grid(True)
    plt.show()

# Scatter plot of Elevation Gain against Distance, coloured by Bike Type

plt.figure(figsize=(10, 6))
# Fixed colour per bike type so the classes are easy to tell apart
colors = {'race bike': 'red', 'mtb': 'green', 'trecking bike': 'yellow'}
sns.scatterplot(data=data, x='Elevation Gain', y='Distance', hue='Bike Type', palette=colors)
plt.title('Scatterplot von Elevation Gain vs Distance für Bike Type')
plt.xlabel('Elevation Gain')
plt.ylabel('Distance')
plt.grid(True)
plt.show()

# Number of rides per weekday, with one bar per bike type
plt.figure(figsize=(10, 6))
sns.countplot(data=data, x='Weekday', hue='Bike Type')
plt.title('Anzahl der Fahrten pro Wochentag nach Bike Type')
plt.xlabel('Wochentag')
plt.ylabel('Anzahl')
plt.xticks(rotation=45)
plt.legend(title='Bike Type')
plt.show()

# Pairwise correlation of the four numeric ride metrics over all rides
correlation_matrix = data[['Distance', 'Elapsed Time', 'Elevation High', 'Elevation Gain']].corr()

# Annotated heat map of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
plt.title('Korrelationsmatrix der numerischen Attribute (Alle Daten)')
plt.show()

# One annotated correlation heat map of the numeric ride metrics per bike type
for bike_type, group in data.groupby('Bike Type'):
    plt.figure(figsize=(12, 8))

    # Correlations are computed only on the rides of the current bike type
    correlation_matrix = group[['Distance', 'Elapsed Time', 'Elevation High', 'Elevation Gain']].corr()
    sns.heatmap(data=correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)

    plt.title(f'Korrelationsmatrix der numerischen Attribute ({bike_type})')
    plt.show()

# Network structure: every ride feature is a direct parent of 'Bike Type'
model = BayesianNetwork([
    (feature, 'Bike Type')
    for feature in ['Distance', 'Elapsed Time', 'Elevation High', 'Elevation Gain', 'Weekday']
])

# Share of rides per Elapsed Time bin
print(pd.cut(data['Elapsed Time'],bins=[0, 5666, 6810, np.inf], labels=['low', 'medium', 'high']).value_counts(normalize=True).sort_index())

Elapsed Time
low       0.502646
medium    0.248677
high      0.248677
Name: proportion, dtype: float64

# We obtain:
# low      ~0.5026
# medium   ~0.2487
# high     ~0.2487

# The resulting prior CPT (values copied by hand from the proportions printed for the Elapsed Time bins):
cpd_elapsed_time = TabularCPD(
    variable='Elapsed Time',
    variable_card=3,
    values=[
        [0.5026],    # P(low)
        [0.2487],   # P(medium)
        [0.2487]    # P(high)
    ],
    state_names={'Elapsed Time': ['low', 'medium', 'high']})

# Share of rides per Distance bin
print(pd.cut(data['Distance'], bins=[0, 35.25 , 41.89, np.inf], labels=['short', 'medium', 'long']).value_counts(normalize=True).sort_index())

Distance
short     0.502646
medium    0.248677
long      0.248677
Name: proportion, dtype: float64

# We obtain:
# short    ~0.5026
# medium   ~0.2487
# long     ~0.2487

# The resulting prior CPT (values copied by hand from the proportions printed for the Distance bins):
cpd_distance = TabularCPD(
    variable='Distance',
    variable_card=3,
    values=[
        [0.5026],    # P(short)
        [0.2487],   # P(medium)
        [0.2487]    # P(long)
    ],
    state_names={'Distance': ['short', 'medium', 'long']}
)

# Share of rides per Elevation High bin; the lower edge is -inf because negative values are binned too
print(pd.cut(data['Elevation High'], bins=[-np.inf, 416.2, np.inf], labels=['low', 'high']).value_counts(normalize=True).sort_index())

Elevation High
low     0.502646
high    0.497354
Name: proportion, dtype: float64

# We obtain:
# low     ~0.5026
# high    ~0.4974

# The resulting prior CPT (values copied by hand from the proportions printed for the Elevation High bins):
cpd_elevation_high = TabularCPD(
    variable='Elevation High',
    variable_card=2,
    values=[
        [0.5026],  # P(low)
        [0.4974]   # P(high)
    ],
    state_names={'Elevation High': ['low', 'high']}
)

# Share of rides per Elevation Gain bin
print(pd.cut(data['Elevation Gain'], bins=[0, 279, 517, np.inf], labels=['low', 'medium', 'high']).value_counts(normalize=True).sort_index())

Elevation Gain
low       0.502646
medium    0.248677
high      0.248677
Name: proportion, dtype: float64

# We obtain:
# low     ~0.5026
# medium  ~0.2487
# high    ~0.2487

# The resulting prior CPT (values copied by hand from the proportions printed for the Elevation Gain bins):

cpd_elevation_gain = TabularCPD(
    variable='Elevation Gain',
    variable_card=3,
    values=[
        [0.5026],   # P(low)
        [0.2487],   # P(medium)
        [0.2487]    # P(high)
    ],
    state_names={'Elevation Gain': ['low', 'medium', 'high']}
)

# Share of rides on weekdays vs. weekends (Saturday and Sunday count as weekend)
print(data['Weekday'].apply(lambda x: 'weekend' if x in ['Saturday', 'Sunday'] else 'weekday').value_counts(normalize=True).sort_index())

Weekday
weekday    0.698413
weekend    0.301587
Name: proportion, dtype: float64

# We obtain:
# weekday    ~0.6984
# weekend    ~0.3016

# The resulting prior CPT (values copied by hand from the proportions printed for weekday/weekend):
cpd_weekday = TabularCPD(
    variable='Weekday',
    variable_card=2,
    values=[
        [0.6984],   # P(weekday)
        [0.3016]    # P(weekend)
    ],
    state_names={'Weekday': ['weekday', 'weekend']})

# Work on a copy of the data that holds all features in discretised form
df_cpt = data.copy()

# Discretisation (same bins and labels as used for the prior CPTs)
df_cpt['Elapsed Time'] = pd.cut(df_cpt['Elapsed Time'], bins=[0, 5666, 6810, np.inf], labels=['low', 'medium', 'high'])
df_cpt['Distance'] = pd.cut(df_cpt['Distance'], bins=[0, 35.25, 41.89, np.inf], labels=['short', 'medium', 'long'])
df_cpt['Elevation High'] = pd.cut(df_cpt['Elevation High'], bins=[-np.inf, 416.2, np.inf], labels=['low', 'high'])
df_cpt['Elevation Gain'] = pd.cut(df_cpt['Elevation Gain'], bins=[0, 279, 517, np.inf], labels=['low', 'medium', 'high'])
df_cpt['Weekday'] = df_cpt['Weekday'].apply(lambda x: 'weekend' if x in ['Saturday', 'Sunday'] else 'weekday')

# Count the rides for every (parent combination, bike type) group ...
grouped = df_cpt.groupby(['Elapsed Time', 'Distance', 'Elevation High', 'Elevation Gain', 'Weekday', 'Bike Type'], observed=False).size()
# ... and pivot 'Bike Type' into columns, one row per parent combination
grouped = grouped.unstack(fill_value=0)

# Divide each row by its total to turn counts into relative frequencies per parent combination
cpt_bike_type = grouped.div(grouped.sum(axis=1), axis=0)

# Parent variables of 'Bike Type', in the order used for the CPT columns:
# ['Elapsed Time', 'Distance', 'Elevation High', 'Elevation Gain', 'Weekday']

# Number of states per variable
# Elapsed Time: 3 (low, medium, high)
# Distance: 3  (short, medium, long)
# Elevation High: 2 (low, high)
# Elevation Gain: 3 (low, medium, high)
# Weekday: 2 (weekday, weekend)

# Bike Type (target variable): 3 classes -> ['mtb', 'race bike', 'trecking bike']

evidence = ['Elapsed Time', 'Distance', 'Elevation High', 'Elevation Gain', 'Weekday']
evidence_card = [3, 3, 2, 3, 2]

# Classes of Bike Type; also fixes the row order of the CPT
bike_type_states = ['mtb', 'race bike', 'trecking bike']

# Every combination of parent states. from_product varies the last level
# fastest, which is the column order TabularCPD expects for `evidence`.
all_states = pd.MultiIndex.from_product([
    ['low', 'medium', 'high'],
    ['short', 'medium', 'long'],
    ['low', 'high'],
    ['low', 'medium', 'high'],
    ['weekday', 'weekend']
], names=evidence)

# Fallback for parent combinations that never occur in the data: the overall
# class shares (race bike ~55.6 %, trecking bike ~32.3 %, mtb ~12.2 %).
# They are taken from the data instead of hand-rounded literals so that they
# sum to exactly 1 and are matched to the classes by label, not by position.
bike_type_prior = (
    data['Bike Type']
    .value_counts(normalize=True)
    .reindex(bike_type_states, fill_value=0.0)
)

# Align rows with all parent combinations and columns with the class order,
# then fill the empty (all-NaN) rows column-wise with the prior.
cpt_complete = (
    cpt_bike_type
    .reindex(index=all_states, columns=bike_type_states)
    .fillna(bike_type_prior)
)

# One list per Bike Type class (i.e. the table transposed, as pgmpy expects)
cpt_values = [
    cpt_complete[bike_type].values.tolist() for bike_type in bike_type_states
]

cpd_bike_type = TabularCPD(
    variable='Bike Type',
    variable_card=3,
    values=cpt_values,
    evidence=evidence,
    evidence_card=evidence_card,
    state_names={
        'Bike Type': bike_type_states,
        'Elapsed Time': ['low', 'medium', 'high'],
        'Distance': ['short', 'medium', 'long'],
        'Elevation High': ['low', 'high'],
        'Elevation Gain': ['low', 'medium', 'high'],
        'Weekday': ['weekday', 'weekend']
    }
)

# Attach all CPTs defined above to the model
model.add_cpds(
    cpd_distance,
    cpd_elapsed_time,
    cpd_elevation_high,
    cpd_elevation_gain,
    cpd_weekday,
    cpd_bike_type 
)

# Validate the model. An explicit raise is used instead of `assert` so the
# check is not stripped when Python runs with -O.
if not model.check_model():
    raise ValueError("Modell ist nicht konsistent!")
print("Bayes-Netzwerk erfolgreich aufgebaut und validiert.")

Bayes-Netzwerk erfolgreich aufgebaut und validiert.

# Create the inference object (variable elimination on the validated model)
infer = VariableElimination(model)

# Marginal distribution of Distance without any evidence
print(infer.query(['Distance']))

+------------------+-----------------+
| Distance         |   phi(Distance) |
+==================+=================+
| Distance(short)  |          0.5026 |
+------------------+-----------------+
| Distance(medium) |          0.2487 |
+------------------+-----------------+
| Distance(long)   |          0.2487 |
+------------------+-----------------+

# Distribution of Distance given that the bike type is known to be 'race bike'
print(infer.query(['Distance'], evidence={'Bike Type': 'race bike'}))

+------------------+-----------------+
| Distance         |   phi(Distance) |
+==================+=================+
| Distance(short)  |          0.5139 |
+------------------+-----------------+
| Distance(medium) |          0.2160 |
+------------------+-----------------+
| Distance(long)   |          0.2701 |
+------------------+-----------------+

# Example: a ride with fully observed, discretised features
evidence_sample = {
    'Distance': 'long',
    'Elapsed Time': 'high',
    'Elevation High': 'high',
    'Elevation Gain': 'high',
    'Weekday': 'weekend'
}

# Run inference: distribution of Bike Type given the example evidence
print(infer.query(variables=['Bike Type'], evidence=evidence_sample))

+--------------------------+------------------+
| Bike Type                |   phi(Bike Type) |
+==========================+==================+
| Bike Type(mtb)           |           0.0000 |
+--------------------------+------------------+
| Bike Type(race bike)     |           0.9412 |
+--------------------------+------------------+
| Bike Type(trecking bike) |           0.0588 |
+--------------------------+------------------+

# Evaluation data: the frame that was discretised for the model.
# NOTE(review): these are the same rows the Bike Type CPT was estimated from,
# so the scores below measure fit on the training data, not generalisation.
test_data = df_cpt.copy()

# Features passed to the network as evidence for every ride
feature_columns = ['Distance', 'Elapsed Time', 'Elevation High', 'Elevation Gain', 'Weekday']

# Predicted label per ride
predictions = []

for i, row in test_data.iterrows():
    # Named `row_evidence` on purpose: the global `evidence` holds the parent
    # list used to build the Bike Type CPT and must not be overwritten here.
    row_evidence = {col: row[col] for col in feature_columns}

    # Run inference for this ride
    result = infer.query(variables=['Bike Type'], evidence=row_evidence)

    # Prediction: the class with the highest probability
    predicted_class = result.values.argmax()
    predicted_label = result.state_names['Bike Type'][predicted_class]
    predictions.append(predicted_label)

# Actual labels
true_labels = test_data['Bike Type'].tolist()

# Evaluation
print("Accuracy:", accuracy_score(true_labels, predictions))
print("\nClassification Report:")
print(classification_report(true_labels, predictions))

Accuracy: 0.8095238095238095

Classification Report:
               precision    recall  f1-score   support

          mtb       0.75      0.13      0.22        23
    race bike       0.94      0.88      0.91       105
trecking bike       0.67      0.95      0.78        61

     accuracy                           0.81       189
    macro avg       0.79      0.65      0.64       189
 weighted avg       0.83      0.81      0.78       189

# Compute the confusion matrix with rows/columns in the order of bike_type_states
cm = confusion_matrix(true_labels, predictions, labels=bike_type_states)

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=bike_type_states)
disp.plot(cmap='Blues', xticks_rotation=45)
plt.title("Konfusionsmatrix")
plt.show()

# Discretised rides for the case-based reasoning part. The bins are the same
# as in part 1, so the frame already discretised there (df_cpt) is reused
# instead of repeating every threshold; only the column order is set here.
discretized_data = df_cpt[['Elapsed Time', 'Distance', 'Elevation High', 'Elevation Gain', 'Weekday', 'Bike Type']].copy()

def aehnlichkeit(fall1, fall2, gewichte=None):
    """
    Return the weighted simple-matching similarity of two discretised cases.

    For every weighted feature the weight is added to the score when both
    cases contain the feature and hold the same value. A feature that is
    missing from either case never counts as a match. The score is divided
    by the sum of all weights.

    Args:
        fall1 (dict | pd.Series): A case mapping feature names to discrete values.
        fall2 (dict | pd.Series): A case mapping feature names to discrete values.
        gewichte (dict | None): Optional mapping feature name -> weight.
            Defaults to the weights used throughout this notebook.

    Returns:
        float: Similarity in [0, 1] (for non-negative weights); 0.0 if the
        weights do not sum to a positive value.
    """
    if gewichte is None:
        # Default features and their weights
        gewichte = {
            'Elapsed Time': 0.5,
            'Distance': 0.25,
            'Elevation High': 0.25,
            'Elevation Gain': 0.25,
            'Weekday': 0.2
        }

    gesamtgewicht = sum(gewichte.values())
    if gesamtgewicht <= 0:
        # Nothing to compare on; also avoids a division by zero
        return 0.0

    score = 0.0
    for merkmal, gewicht in gewichte.items():
        # Previously `.get()` made two absent features compare equal (None == None)
        if merkmal not in fall1 or merkmal not in fall2:
            continue
        if fall1[merkmal] == fall2[merkmal]:
            score += gewicht

    # Normalise to [0, 1]
    return score / gesamtgewicht

# Use the previously discretised data: compare its first two rides
fall1 = discretized_data.iloc[0].to_dict()
fall2 = discretized_data.iloc[1].to_dict()

print(fall1)
print(fall2)

print("Ähnlichkeit: ", aehnlichkeit(fall1, fall2))

{'Elapsed Time': 'high', 'Distance': 'medium', 'Elevation High': 'high', 'Elevation Gain': 'medium', 'Weekday': 'weekend', 'Bike Type': 'mtb'}
{'Elapsed Time': 'low', 'Distance': 'short', 'Elevation High': 'high', 'Elevation Gain': 'low', 'Weekday': 'weekday', 'Bike Type': 'race bike'}
Ähnlichkeit:  0.1724137931034483

def finde_aehnlichsten_fall(df, person_zu_vergleichen):
    """
    Return the row of ``df`` that is most similar to the given case.

    Every row is compared to ``person_zu_vergleichen`` with ``aehnlichkeit``.
    On ties the first row with the highest similarity wins.

    Args:
        df (pd.DataFrame): Discretised rides including their labels.
        person_zu_vergleichen (dict): Feature values of the query case.

    Returns:
        tuple: (index label, highest similarity, row as Series).
        For an empty ``df`` the placeholders (-1, -1, None) are returned.
    """
    # Start below every possible similarity (similarities are >= 0), so the
    # first row always replaces the placeholder.
    bester = (-1, -1, None)

    for idx, zeile in df.iterrows():
        wert = aehnlichkeit(person_zu_vergleichen, zeile.to_dict())
        # Strict comparison: an equally similar later row does not replace the earlier one
        if wert > bester[1]:
            bester = (idx, wert, zeile)

    return bester

# A made-up query case (already discretised, without a label)
fall_fiktiv = {'Elapsed Time': 'high', 'Distance': 'long', 'Elevation High': 'high',
            'Elevation Gain': 'medium', 'Weekday': 'weekday'}

# Retrieve the most similar stored ride
idx, sim, retrieved = finde_aehnlichsten_fall(discretized_data, fall_fiktiv)
print("Index:", idx)
print("Ähnlichkeit:", sim)
print("Ähnlichster Datensatz:\n", retrieved)

# Reuse: take over the label of the retrieved ride as the prediction
bike_type = retrieved['Bike Type']
print(f"Daher sollte das Fahrrad vom Type {bike_type} sein.")

Index: 42
Ähnlichkeit: 1.0
Ähnlichster Datensatz:
 Elapsed Time           high
Distance               long
Elevation High         high
Elevation Gain       medium
Weekday             weekday
Bike Type         race bike
Name: 42, dtype: object
Daher sollte das Fahrrad vom Type race bike sein.

# Seed the case base with the first ride. Taking the row directly avoids
# concatenating onto an empty DataFrame; astype(object) keeps plain object
# columns, as the concatenated rows below have.
cases = discretized_data.iloc[[0]].astype(object).reset_index(drop=True)

# Grow the case base: a ride is stored only if the current case base would
# assign it the wrong bike type. The seed row is skipped because its nearest
# case is itself and it can therefore never be added again.
for index, row in discretized_data.iloc[1:].iterrows():
    # Find the most similar case in "cases"
    i, m, case = finde_aehnlichsten_fall(cases, row)

    # Compare the retrieved bike type with the bike type of the current ride
    if case['Bike Type'] != row['Bike Type']:
        # Misclassified: add the ride to "cases"
        cases = pd.concat([cases, row.to_frame().T], ignore_index=True)

print(f"Der Datensatz der Größe {len(discretized_data)} wurde in {len(cases)} Fällen gespeichert.")

Der Datensatz der Größe 189 wurde in 66 Fällen gespeichert.

def treffer(discretized_data, cases):
    """
    Count how often finde_aehnlichsten_fall retrieves the correct Bike Type.

    For every row that is misclassified a line naming its index is printed.

    Args:
        discretized_data: DataFrame whose rows are checked.
        cases: DataFrame that is searched for the most similar case.

    Returns:
        int: Number of rows whose retrieved case has the same Bike Type.
    """
    anzahl_treffer = 0

    for idx, zeile in discretized_data.iterrows():
        naechster_fall = finde_aehnlichsten_fall(cases, zeile)[2]

        if naechster_fall['Bike Type'] != zeile['Bike Type']:
            print(f"Fehler in Datensatz {idx}")
            continue

        anzahl_treffer += 1

    return anzahl_treffer

# Check every ride against the reduced case base and count the correct retrievals
anzahl_korrekte_uebereinstimmungen = treffer(discretized_data, cases)

print("Anzahl korrekter Übereinstimmungen:", anzahl_korrekte_uebereinstimmungen)

Fehler in Datensatz 7
Fehler in Datensatz 10
Fehler in Datensatz 11
Fehler in Datensatz 16
Fehler in Datensatz 19
Fehler in Datensatz 30
Fehler in Datensatz 50
Fehler in Datensatz 53
Fehler in Datensatz 69
Fehler in Datensatz 75
Fehler in Datensatz 77
Fehler in Datensatz 79
Fehler in Datensatz 80
Fehler in Datensatz 86
Fehler in Datensatz 89
Fehler in Datensatz 94
Fehler in Datensatz 100
Fehler in Datensatz 102
Fehler in Datensatz 104
Fehler in Datensatz 107
Fehler in Datensatz 119
Fehler in Datensatz 124
Fehler in Datensatz 136
Fehler in Datensatz 139
Fehler in Datensatz 147
Fehler in Datensatz 150
Fehler in Datensatz 151
Fehler in Datensatz 162
Fehler in Datensatz 164
Fehler in Datensatz 166
Fehler in Datensatz 177
Fehler in Datensatz 178
Fehler in Datensatz 181
Fehler in Datensatz 182
Fehler in Datensatz 186
Fehler in Datensatz 189
Fehler in Datensatz 193
Fehler in Datensatz 196
Fehler in Datensatz 197
Fehler in Datensatz 198
Fehler in Datensatz 206
Fehler in Datensatz 208
Fehler in Datensatz 209
Fehler in Datensatz 212
Fehler in Datensatz 214
Fehler in Datensatz 220
Fehler in Datensatz 225
Fehler in Datensatz 226
Fehler in Datensatz 235
Fehler in Datensatz 246
Anzahl korrekter Übereinstimmungen: 139

# 1. Generate predictions: the label of the most similar stored case per ride
y_true = discretized_data['Bike Type'].tolist()
y_pred = [
    finde_aehnlichsten_fall(cases, row)[2]['Bike Type']
    for _, row in discretized_data.iterrows()
]

# 2. Build and print the report
report = classification_report(y_true, y_pred)
print(report)

               precision    recall  f1-score   support

          mtb       0.28      0.30      0.29        23
    race bike       0.91      0.78      0.84       105
trecking bike       0.68      0.82      0.74        61

     accuracy                           0.74       189
    macro avg       0.62      0.63      0.62       189
 weighted avg       0.76      0.74      0.74       189

# Use one sorted label list for both the matrix and its tick labels. This
# makes the class order independent of the row order in the data and equal
# to the alphabetical order used for the Bayesian network's matrix.
cbr_labels = sorted(discretized_data['Bike Type'].unique())

cm = confusion_matrix(y_true, y_pred, labels=cbr_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cbr_labels)
disp.plot(cmap='Blues', xticks_rotation=45)
plt.title("Confusion Matrix für CBR-Klassifikation")
plt.show()

models = ['Bayes-Netz', 'CBR']

# Metrics for both models, derived from the stored predictions instead of
# being copied by hand from the printed reports, so the chart cannot drift
# from the actual results. Order matches `models`.
reports = [
    classification_report(true_labels, predictions, output_dict=True),  # Bayesian network
    classification_report(y_true, y_pred, output_dict=True),            # CBR
]
accuracy = [rep['accuracy'] for rep in reports]
precision = [rep['macro avg']['precision'] for rep in reports]
recall = [rep['macro avg']['recall'] for rep in reports]
f1_score = [rep['macro avg']['f1-score'] for rep in reports]

fig, ax = plt.subplots(figsize=(10, 6))

# Bar positions: one group per model, four bars of height `width` per group
x = np.arange(len(models))
width = 0.2

# One horizontal bar per metric, offset around the group centre
ax.barh(x - 1.5 * width, accuracy, width, label='Accuracy', color='skyblue')
ax.barh(x - 0.5 * width, precision, width, label='Precision (Macro)', color='orange')
ax.barh(x + 0.5 * width, recall, width, label='Recall (Macro)', color='green')
ax.barh(x + 1.5 * width, f1_score, width, label='F1-Score (Macro)', color='purple')

# Axis labels and title
ax.set_xlabel('Wert')
ax.set_title('Vergleich der Modelle nach Metriken')
ax.set_yticks(x)
ax.set_yticklabels(models)
ax.set_xlim(0, 1.05)
ax.legend()

# Add grid lines
plt.grid(axis='x', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

	Distance	Elapsed Time	Elevation High	Elevation Gain
count	189.000000	189.000000	189.000000	189.000000
mean	40.362381	6947.867725	458.780423	412.484603
std	27.906235	5312.384989	322.351340	479.385288
min	3.230000	952.000000	-120.600000	13.800000
25%	28.300000	4697.000000	329.000000	146.000000
50%	35.250000	5666.000000	416.200000	279.000000
75%	41.890000	6810.000000	501.000000	517.000000
max	219.320000	45073.000000	2766.800000	4838.000000

Statistik	Wert
Anzahl	189
Mittelwert	~6948
Standardabweichung	~5312
Minimum	952
25%-Perzentil	4697
Median	5666
75%-Perzentil	6810
Maximum	45073

Statistik	Wert
Anzahl	189
Mittelwert	~40.36
Standardabweichung	~27.91
Minimum	3.23
25%-Perzentil	28.30
Median	35.25
75%-Perzentil	41.89
Maximum	219.32

Statistik	Wert
Anzahl	189
Mittelwert	~458.78
Standardabweichung	~322.35
Minimum	-120.6
25%-Perzentil	329.0
Median	416.2
75%-Perzentil	501.0
Maximum	2766.8

Statistik	Wert
Anzahl	189
Mittelwert	~412.48
Standardabweichung	~479.39
Minimum	13.80
25%-Perzentil	146.00
Median	279.00
75%-Perzentil	517.00
Maximum	4838.00

	Distance	Elapsed Time	Elevation High	Elevation Gain	Weekday	Bike Type
2	40.27	22875	2279.9	494.11	Saturday	mtb
3	3.23	1233	539.0	21.00	Wednesday	race bike
4	35.40	6199	371.0	157.00	Wednesday	trecking bike
5	65.38	10591	530.2	937.00	Saturday	race bike
6	52.38	7940	615.8	721.00	Saturday	race bike

Metrik	Wert
Accuracy	0.81
Precision (Macro Average)	0.79
Recall (Macro Average)	0.65
F1-Score (Macro Average)	0.64

Metrik	Wert
Accuracy	0.74
Precision (Macro Average)	0.62
Recall (Macro Average)	0.63
F1-Score (Macro Average)	0.62

Modell	Accuracy	Precision (Macro)	Recall (Macro)	F1-Score (Macro)
Bayes-Netz	0.81	0.79	0.65	0.64
CBR-Modell	0.74	0.62	0.63	0.62

Laborarbeit Künstliche Intelligenz¶

Thema : Bayes Netze und Case Based Reasoning¶

Namen des/r Studierenden:¶

Setup: Bibliotheken laden und Datensatz importieren¶

Datenbereinigung¶

Suche nach NaN Werten¶

Explorative Datenanalyse¶

Boxplots der numerischen Daten¶

Häufigkeitsverteilungen der kategorialen Variablen¶

Scatterplot von Elevation Gain und Distance nach Bike Type¶

Verteilung der Bike Types über die Wochentage¶

Korrelationsmatrix der numerischen Daten¶

Korrelationsmatrix der numerischen Daten je Bike Type¶

Aufgabenteil 1: Bayes Netze¶

1. Modell anlegen¶

Begründung zum Modell¶

2. CPTs anlegen¶

2.1 Elapsed Time¶

Diskretisierungsregeln:¶

Begründung:¶

Nächster Schritt:¶

Erstellung der CPT für Elapsed Time¶

2.2 Distance¶

Diskretisierungsregeln:¶

Begründung:¶

Nächster Schritt:¶

Erstellung der CPT für Distance¶

2.3 Elevation High¶

Diskretisierungsregeln:¶

Begründung:¶

Nächster Schritt:¶

Erstellung der CPT für Elevation High¶

2.4 Elevation Gain¶

Diskretisierungsregeln:¶

Begründung:¶

Nächster Schritt:¶

Erstellung der CPT für Elevation Gain¶

2.5 Weekday¶

Diskretisierungsregel:¶

Begründung:¶

Erstellung der CPT für Weekday¶

2.6 Bike Type¶

2.7 CPTs zum Modell hinzufügen¶

2.8 Modell überprüfen¶

3. Inferenz¶

4. Test und Bewertung¶

4.1 Test¶

4.2 Diskussion der Ergebnisse¶

Aufgabenteil 2: Case Based Reasoning¶

1. Ähnlichkeitsmaß definieren¶

Begründung zur Wahl des Ähnlichkeitsmaßes¶

2. CBR Zyklus umsetzen¶

3. Test und Bewertung¶

3.1 Test¶

3.2 Diskussion der Ergebnisse¶

Abschluss¶

Bayes-Netz¶

CBR-Modell¶

Vergleich beider Modelle¶

Fazit¶